diff --git a/.gitignore b/.gitignore
index 640ce54..1bc38d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -200,4 +200,5 @@
 nvidia_curobo*/
 *tfevents*
-*.csv
\ No newline at end of file
+*.csv
+docs/*
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ddb098..f9e37da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,12 +9,62 @@
 without an express license agreement from NVIDIA CORPORATION or its affiliates is
 strictly prohibited. -->
 # Changelog
-## Latest Commit
-### BugFixes
-- refactored wp.index() instances to `[]` to avoid errors in using with warp-lang>=0.11.
-- Fixed bug in gaussian transformation to ensure values are not -1 or +1.
+
+## Version 0.6.3
+### Changes in default behavior
+- Increased default collision cache to 50 in RobotWorld.
+- Changed `CSpaceConfig.position_limit_clip` default to 0, as the previous default of 0.01 could
+put the default start state in examples out of bounds.
+- MotionGen uses parallel_finetune by default. To get the previous motion gen behavior, pass
+`warmup(parallel_finetune=False)` and `MotionGenPlanConfig(parallel_finetune=False)`.
+- MotionGen loads the Mesh collision checker by default instead of Primitive.
+
+### Breaking Changes
+- Renamed `copy_if_not_none` to `clone_if_not_none` to be more descriptive. Now `copy_if_not_none`
+will try to copy data into the reference.
+- Renamed `n_envs` in the curobo.opt module to avoid confusion between parallel environments and
+parallel problems in optimization.
+- Added more inputs to pose distance kernels. Check `curobolib/geom.py`.
+- Pose cost `run_vec_weight` should now be `[0,0,0,0,0,0]` instead of `[1,1,1,1,1,1]`.
+
+### New Features
+- Add function to disable and enable collision for specific links in KinematicsTensorConfig.
+- Add goal index to reacher results to return the index of the goal reached during goalset
+planning.
+- Add locked joint state update API in the MotionGen class.
+- Add goalset warmup padding to handle a varied number of goals during goalset planning, and also
+when calling plan_single after a goalset warmup.
+- Add new trajopt config to allow for smooth solutions at slow speeds (`velocity_scale<=0.25`).
+Also add an error when `velocity_scale<0.1`.
+- Add experimental robot image segmentation module to enable robot removal in depth images.
+- Add constrained planning mode to motion_gen (see the example at the end of this section).
+
+### BugFixes & Misc.
+- Refactored `wp.index()` instances to `[]` to avoid errors in future releases of warp.
+- Fixed bug in gaussian transformation to ensure values are not -1 or +1.
+- Fixed bug in ik_solver loading `ee_link_name` from argument.
+- Fixed bug in batch_goalset planning, where pose cost was selected as GOALSET instead of
+BATCH_GOALSET.
+- Added package data to also export `.so` files.
 - Fixed bug in transforming link visual mesh offset when reading from urdf.
 - Fixed bug in MotionGenPlanConfig.clone() that didn't clone the state of parallel_finetune.
+- Increased weighting from 1.0 to 5.0 for optimized_dt in TrajEvaluator to select shorter
+trajectories.
+- Improved determinism by setting the global seed for `random` in `graph_nx.py`.
+- Added option to clear obstacles in WorldPrimitiveCollision.
+- Raise an error when the reference of a tensor changes in MotionGen, IKSolver, and TrajOpt while
+cuda graph is enabled.
+- plan_single will get converted to plan_goalset when a plan_goalset was used to initialize the
+cuda graph.
+- plan_goalset will pad for extra goals when called with fewer goals than at initial creation.
+- Improved API documentation for the Optimizer class.
+- Improved benchmark timings, now within 15ms of results reported in the technical report. Added
+numbers to the benchmark [webpage](https://curobo.org/source/getting_started/4_benchmarks.html) for
+easy reference.
+- Changed the default of `use_cuda_graph` in `MotionGenConfig.load_from_robot_config` from `None`
+to `True`.
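+
+### Example: Constrained Planning
+A minimal sketch of the new constrained planning mode, following the usage in
+`examples/isaac_sim/motion_gen_reacher.py` and `examples/isaac_sim/simple_stacking.py` from this
+release. It assumes an already-initialized `MotionGen` instance `motion_gen`, a `start_state`, and
+a `goal_pose`; the weight and offset values are illustrative:
+
+```python
+from curobo.wrap.reacher.motion_gen import MotionGenPlanConfig, PoseCostMetric
+
+# Hold orientation and x/y position for the whole motion, leaving only
+# z translation free. The weight layout is [qx, qy, qz, x, y, z]:
+pose_metric = PoseCostMetric(
+    hold_partial_pose=True,
+    hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 1, 1, 0]),
+)
+
+# Alternatively, constrain only the final approach to a grasp pose:
+pose_metric = PoseCostMetric.create_grasp_approach_metric(
+    offset_position=0.1, tstep_fraction=0.6
+)
+
+plan_config = MotionGenPlanConfig(max_attempts=2, pose_cost_metric=pose_metric)
+result = motion_gen.plan_single(start_state, goal_pose, plan_config)
+```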
+
+### Known Bugs (WIP)
+- Examples don't run in Isaac Sim 2023.1.1 due to a behavior change in the urdf importer.
+
 ## Version 0.6.2
 ### New Features
 - Added support for actuated axis to be negative (i.e., urdf joints with `` are
@@ -35,6 +85,7 @@ Run `benchmark/ik_benchmark.py` to get the latest results.
 - Added `external_asset_path` to robot configuration to help in loading urdf and meshes from an
 external directory.
+
 ### BugFixes & Misc.
 - Update nvblox wrappers to work with v0.0.5 without segfaults. Significantly improves stability.
 - Remove mimic joints in franka panda to maintain compatibility with Isaac Sim 2023.1.0 and 2022.2.1
@@ -60,6 +111,7 @@ planning time, 20ms slower on 4090 and 40ms on ORIN MAXN. We will address this
 release. One way to avoid this regression is to set `finetune_dt_scale=1.05` in
 `MotionGenConfig.load_from_robot_config()`.
+
 ## Version 0.6.1
 - Added changes to `examples/isaac_sim` to support Isaac Sim 2023.1.0
diff --git a/MANIFEST.in b/MANIFEST.in
index 2f36d88..0ee060b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -15,4 +15,5 @@
 # graft
 graft src/curobo/content
-global-include py.typed
\ No newline at end of file
+global-include py.typed
+graft src/curobo/curobolib/*.so
\ No newline at end of file
diff --git a/benchmark/README.md b/benchmark/README.md
index f1e39dc..77c4937 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -12,10 +12,47 @@
 This folder contains scripts to run the motion planning benchmarks. Refer to
 Benchmarks & Profiling instructions:
 https://curobo.org/source/getting_started/4_benchmarks.html.

-### Note
-
 Results in the arxiv paper were obtained from v0.6.0.

-v0.6.2+ has significant changes to improve motion quality with lower motion time, lower path length, higher pose accuracy (<1mm). v0.6.2+ sacrifices 30ms (RTX 4090) of compute time to achieve signficantly better solutions. The new results are yet to be added to the technical report. For now, to get the latest benchmark results follow instructions here: https://curobo.org/source/getting_started/4_benchmarks.html.
+v0.6.2+ has significant changes to improve motion quality with lower motion time, lower path length, and higher pose accuracy (<1mm). v0.6.2+ sacrifices 15ms (RTX 4090) of compute time to achieve significantly better solutions. The new results are yet to be added to the technical report. For now, to get the latest benchmark results follow the instructions here: https://curobo.org/source/getting_started/4_benchmarks.html.

-To get results similar to in the technical report, pass `--report_edition` to `curobo_benchmark.py`.
\ No newline at end of file
+To get results similar to those in the technical report, pass `--report_edition` to
+`curobo_benchmark.py`.
+
+
+# Latest Results (Feb 2024)
+
+Motion generation on 2600 problems from Motion Benchmaker and Motion Policy Networks gives the
+following values on an RTX 4090:
+
+| Metric             | Value                                                          |
+|--------------------|----------------------------------------------------------------|
+|Success %           | 99.84                                                          |
+|Plan Time (s)       | mean: 0.068 ± 0.158 median: 0.042 75%: 0.055 98%: 0.246        |
+|Motion Time (s)     | mean: 1.169 ± 0.360 median: 1.140 75%: 1.381 98%: 2.163        |
+|Path Length (rad.)  | mean: 3.177 ± 1.072 median: 3.261 75%: 3.804 98%: 5.376        |
+|Jerk                | mean: 97.700 ± 48.630 median: 88.993 75%: 126.092 98%: 199.540 |
+|Position Error (mm) | mean: 0.119 ± 0.341 median: 0.027 75%: 0.091 98%: 1.039        |
+
+
+## Motion Benchmaker (800 problems):
+
+| Metric             | Value                                                          |
+|--------------------|----------------------------------------------------------------|
+|Success %           | 100                                                            |
+|Plan Time (s)       | mean: 0.063 ± 0.137 median: 0.042 75%: 0.044 98%: 0.206        |
+|Motion Time (s)     | mean: 1.429 ± 0.330 median: 1.392 75%: 1.501 98%: 2.473        |
+|Path Length (rad.)  | mean: 3.956 ± 0.783 median: 3.755 75%: 4.352 98%: 6.041        |
+|Jerk                | mean: 67.284 ± 27.788 median: 61.853 75%: 83.337 98%: 143.118  |
+|Position Error (mm) | mean: 0.079 ± 0.139 median: 0.032 75%: 0.077 98%: 0.472        |
+
+
+## Motion Policy Networks (1800 problems):
+
+| Metric             | Value                                                             |
+|--------------------|-------------------------------------------------------------------|
+|Success %           | 99.77                                                             |
+|Plan Time (s)       | mean: 0.068 ± 0.117 median: 0.042 75%: 0.059 98%: 0.243           |
+|Motion Time (s)     | mean: 1.051 ± 0.306 median: 1.016 75%: 1.226 98%: 1.760           |
+|Path Length (rad.)  | mean: 2.829 ± 1.000 median: 2.837 75%: 3.482 98%: 4.905           |
+|Jerk                | mean: 110.610 ± 47.586 median: 105.271 75%: 141.517 98%: 217.158  |
+|Position Error (mm) | mean: 0.122 ± 0.343 median: 0.024 75%: 0.095 98%: 1.114           |
diff --git a/benchmark/curobo_benchmark.py b/benchmark/curobo_benchmark.py
index d3bd0c7..944282b 100644
--- a/benchmark/curobo_benchmark.py
+++ b/benchmark/curobo_benchmark.py
@@ -10,12 +10,15 @@
 #
 # Standard Library
+import argparse
+import random
 from copy import deepcopy
 from typing import Optional

 # Third Party
 import numpy as np
 import torch
+from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw
 from tqdm import tqdm

 # CuRobo
@@ -26,6 +29,7 @@
 from curobo.types.math import Pose
 from curobo.types.robot import RobotConfig
 from curobo.types.state import JointState
 from curobo.util.logger import setup_curobo_logger
+from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics
 from curobo.util_file import (
     get_assets_path,
     get_robot_configs_path,
@@ -36,28 +40,16 @@
 )
 from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig

-# torch.set_num_threads(8)
-# torch.use_deterministic_algorithms(True)
-torch.manual_seed(0)
+# set seeds
+torch.manual_seed(2)

 torch.backends.cudnn.benchmark = True

-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False

-# torch.backends.cuda.matmul.allow_tf32 = False
-# torch.backends.cudnn.allow_tf32 = False
-
-# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-np.random.seed(0)
-# Standard Library
-import argparse
-import warnings
-from typing import List, Optional
-
-# Third Party
-from metrics import CuroboGroupMetrics, CuroboMetrics
-from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw
+np.random.seed(2)
+random.seed(2)


 def plot_cost_iteration(cost: torch.Tensor, save_path="cost", title="", log_scale=False):
@@ -174,7 +166,7 @@ def load_curobo(
     world_cfg = WorldConfig.from_dict(
         load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
     ).get_obb_world()
-    interpolation_steps = 2000
+    interpolation_steps = 1000
     c_checker = CollisionCheckerType.PRIMITIVE
     c_cache = {"obb": n_cubes}
     if mesh_mode:
@@ -189,7 +181,7 @@
     K =
robot_cfg_instance.kinematics.kinematics_config.joint_limits K.position[0, :] -= 0.2 K.position[1, :] += 0.2 - finetune_iters = None + finetune_iters = 300 grad_iters = None if args.report_edition: finetune_iters = 200 @@ -215,7 +207,7 @@ def load_curobo( collision_activation_distance=collision_activation_distance, trajopt_dt=0.25, finetune_dt_scale=finetune_dt_scale, - maximum_trajectory_dt=0.1, + maximum_trajectory_dt=0.16, ) mg = MotionGen(motion_gen_config) mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=parallel_finetune) @@ -238,7 +230,7 @@ def benchmark_mb( interpolation_dt = 0.02 # mpinets_data = True # if mpinets_data: - file_paths = [motion_benchmaker_raw, mpinets_raw][:] # [1:] + file_paths = [motion_benchmaker_raw, mpinets_raw][:] if args.demo: file_paths = [demo_raw] @@ -269,13 +261,13 @@ def benchmark_mb( if "cage_panda" in key: trajopt_seeds = 16 finetune_dt_scale = 0.95 - collision_activation_distance = 0.01 parallel_finetune = True if "table_under_pick_panda" in key: - tsteps = 44 + tsteps = 40 trajopt_seeds = 24 - finetune_dt_scale = 0.98 + finetune_dt_scale = 0.95 parallel_finetune = True + if "table_pick_panda" in key: collision_activation_distance = 0.005 @@ -297,15 +289,19 @@ def benchmark_mb( "cubby_neutral_start", "cubby_neutral_goal", "dresser_neutral_start", + "dresser_neutral_goal", "tabletop_task_oriented", ]: collision_activation_distance = 0.005 if key in ["dresser_neutral_goal"]: trajopt_seeds = 24 # 24 - tsteps = 36 - collision_activation_distance = 0.005 - scene_problems = problems[key] + # tsteps = 36 + collision_activation_distance = 0.01 + # trajopt_seeds = 12 + scene_problems = problems[key] # [:10] n_cubes = check_problems(scene_problems) + # print(n_cubes) + # continue mg, robot_cfg = load_curobo( n_cubes, enable_debug, @@ -327,11 +323,10 @@ def benchmark_mb( for problem in tqdm(scene_problems, leave=False): i += 1 if problem["collision_buffer_ik"] < 0.0: - # print("collision_ik:", problem["collision_buffer_ik"]) continue plan_config = MotionGenPlanConfig( - max_attempts=100, # 100, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000, + max_attempts=100, enable_graph_attempt=1, disable_graph_attempt=20, enable_finetune_trajopt=True, @@ -349,24 +344,13 @@ def benchmark_mb( problem_name = "d_" + key + "_" + str(i) # reset planner - mg.reset(reset_seed=False) + mg.reset(reset_seed=True) if args.mesh: world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world() else: world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world() mg.world_coll_checker.clear_cache() mg.update_world(world) - # from curobo.geom.types import Cuboid as curobo_Cuboid - - # coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box( - # curobo_Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), - # voxel_size=0.01, - # ) - # - # coll_mesh.save_as_mesh(problem_name + "debug_curobo.obj") - # exit() - # continue - # load obstacles # run planner start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) @@ -379,7 +363,6 @@ def benchmark_mb( ) if result.status == "IK Fail": ik_fail += 1 - # rint(plan_config.enable_graph, plan_config.enable_graph_attempt) problem["solution"] = None problem_name = key + "_" + str(i) @@ -432,7 +415,6 @@ def benchmark_mb( log_scale=False, ) if result.success.item(): - # print("GT: ", result.graph_time) q_traj = result.get_interpolated_plan() problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist() problem["solution"] = { @@ -458,8 +440,7 @@ def 
benchmark_mb( .tolist(), "dt": interpolation_dt, } - # print(problem["solution"]["position"]) - # exit() + debug = { "used_graph": result.used_graph, "attempts": result.attempts, @@ -487,14 +468,7 @@ def benchmark_mb( "valid_query": result.valid_query, } problem["solution_debug"] = debug - # print( - # "T: ", - # result.motion_time.item(), - # result.optimized_dt.item(), - # (len(problem["solution"]["position"]) - 1 ) * result.interpolation_dt, - # result.interpolation_dt, - # ) - # exit() + reached_pose = mg.compute_kinematics(result.optimized_plan[-1]).ee_pose rot_error = goal_pose.angular_distance(reached_pose) * 100.0 if args.graph: @@ -526,6 +500,7 @@ def benchmark_mb( motion_time=result.motion_time.item(), solve_time=solve_time, cspace_path_length=path_length, + jerk=torch.max(torch.abs(result.optimized_plan.jerk)).item(), ) if write_usd: @@ -552,20 +527,9 @@ def benchmark_mb( plot_traj( result.optimized_plan, result.optimized_dt.item(), - # result.get_interpolated_plan(), - # result.interpolation_dt, title=problem_name, save_path=join_path("benchmark/log/plot/", problem_name + ".png"), ) - # plot_traj( - # # result.optimized_plan, - # # result.optimized_dt.item(), - # result.get_interpolated_plan(), - # result.interpolation_dt, - # title=problem_name, - # save_path=join_path("benchmark/log/plot/", problem_name + "_int.pdf"), - # ) - # exit() m_list.append(current_metrics) all_groups.append(current_metrics) @@ -587,7 +551,6 @@ def benchmark_mb( m_list.append(current_metrics) all_groups.append(current_metrics) else: - # print("invalid: " + problem_name) debug = { "used_graph": result.used_graph, "attempts": result.attempts, @@ -601,28 +564,7 @@ def benchmark_mb( } problem["solution_debug"] = debug - if False: - world.save_world_as_mesh(problem_name + ".obj") - - q_traj = start_state # .unsqueeze(0) - # CuRobo - from curobo.util.usd_helper import UsdHelper - - UsdHelper.write_trajectory_animation_with_robot_usd( - robot_cfg, - world, - start_state, - q_traj, - dt=result.interpolation_dt, - save_path=join_path("benchmark/log/usd/", problem_name) + ".usd", - interpolation_steps=1, - write_robot_usd_path="benchmark/log/usd/assets/", - robot_usd_local_reference="assets/", - base_frame="/world_" + problem_name, - visualize_robot_spheres=True, - ) if save_log and not result.success.item(): - # print("save log") UsdHelper.write_motion_gen_log( result, robot_cfg, @@ -637,8 +579,7 @@ def benchmark_mb( write_robot_usd_path="benchmark/log/usd/assets/", # flatten_usd=True, ) - # print(result.status) - # exit() + print(result.status) g_m = CuroboGroupMetrics.from_list(m_list) if not args.kpi: @@ -653,10 +594,6 @@ def benchmark_mb( g_m.motion_time.percent_98, ) print(g_m.attempts) - # print("MT: ", g_m.motion_time) - # $print(ik_fail) - # exit() - # print(g_m.position_error, g_m.orientation_error) g_m = CuroboGroupMetrics.from_list(all_groups) if not args.kpi: @@ -668,12 +605,8 @@ def benchmark_mb( g_m.time.percent_75, g_m.position_error.percent_75, g_m.orientation_error.percent_75, - ) # g_m.time, g_m.attempts) - # print("MT: ", g_m.motion_time) + ) - # print(g_m.position_error, g_m.orientation_error) - - # exit() if write_benchmark: if not mpinets_data: write_yaml(problems, args.file_name + "_mb_solution.yaml") @@ -681,7 +614,6 @@ def benchmark_mb( write_yaml(problems, args.file_name + "_mpinets_solution.yaml") all_files += all_groups g_m = CuroboGroupMetrics.from_list(all_files) - # print(g_m.success, g_m.time, g_m.attempts, g_m.position_error, g_m.orientation_error) print("######## FULL SET 
############") print("All: ", f"{g_m.success:2.2f}") print("MT: ", g_m.motion_time) @@ -690,6 +622,7 @@ def benchmark_mb( print("ST: ", g_m.solve_time) print("position error (mm): ", g_m.position_error) print("orientation error(%): ", g_m.orientation_error) + print("jerk: ", g_m.jerk) if args.kpi: kpi_data = { @@ -702,8 +635,6 @@ def benchmark_mb( } write_yaml(kpi_data, join_path(args.save_path, args.file_name + ".yml")) - # run on mb dataset: - def check_problems(all_problems): n_cube = 0 @@ -801,12 +732,13 @@ if __name__ == "__main__": args = parser.parse_args() setup_curobo_logger("error") - benchmark_mb( - save_log=False, - write_usd=args.save_usd, - write_plot=args.save_plot, - write_benchmark=args.write_benchmark, - plot_cost=False, - graph_mode=args.graph, - args=args, - ) + for _ in range(1): + benchmark_mb( + save_log=False, + write_usd=args.save_usd, + write_plot=args.save_plot, + write_benchmark=args.write_benchmark, + plot_cost=False, + graph_mode=args.graph, + args=args, + ) diff --git a/benchmark/curobo_nvblox_benchmark.py b/benchmark/curobo_nvblox_benchmark.py index cefb69c..3e6ffde 100644 --- a/benchmark/curobo_nvblox_benchmark.py +++ b/benchmark/curobo_nvblox_benchmark.py @@ -18,7 +18,6 @@ from typing import Optional import matplotlib.pyplot as plt import numpy as np import torch -from metrics import CuroboGroupMetrics, CuroboMetrics from nvblox_torch.datasets.mesh_dataset import MeshDataset from nvblox_torch.datasets.sun3d_dataset import Sun3dDataset from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw @@ -34,6 +33,7 @@ from curobo.types.math import Pose from curobo.types.robot import RobotConfig from curobo.types.state import JointState from curobo.util.logger import setup_curobo_logger +from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics from curobo.util_file import ( get_assets_path, get_robot_configs_path, @@ -126,11 +126,12 @@ def load_curobo( mpinets: bool = False, graph_mode: bool = False, cuda_graph: bool = True, + collision_activation_distance: float = 0.025, ): robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] - robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.0 + robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.0 - ik_seeds = 30 # 500 + ik_seeds = 30 if graph_mode: trajopt_seeds = 4 if trajopt_seeds >= 14: @@ -146,7 +147,7 @@ def load_curobo( "world": { "pose": [0, 0, 0, 1, 0, 0, 0], "integrator_type": "tsdf", - "voxel_size": 0.014, + "voxel_size": 0.01, } } } @@ -174,16 +175,17 @@ def load_curobo( store_ik_debug=enable_debug, store_trajopt_debug=enable_debug, interpolation_steps=interpolation_steps, - collision_activation_distance=0.01, + collision_activation_distance=collision_activation_distance, trajopt_dt=0.25, finetune_dt_scale=1.0, maximum_trajectory_dt=0.1, + finetune_trajopt_iters=300, ) mg = MotionGen(motion_gen_config) - mg.warmup(enable_graph=True, warmup_js_trajopt=False) + mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=True) # create a ground truth collision checker: config = RobotWorldConfig.load_from_config( - robot_cfg, + robot_cfg_instance, "collision_table.yml", collision_activation_distance=0.0, collision_checker_type=CollisionCheckerType.PRIMITIVE, @@ -206,7 +208,7 @@ def benchmark_mb( # load dataset: graph_mode = args.graph interpolation_dt = 0.02 - file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][1:] + file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][2:] enable_debug = save_log or plot_cost all_files = [] 
@@ -215,43 +217,73 @@ def benchmark_mb( og_tsteps = override_tsteps og_trajopt_seeds = 12 - + og_collision_activation_distance = 0.03 # 0.03 if args.graph: og_trajopt_seeds = 4 for file_path in file_paths: all_groups = [] mpinets_data = False problems = file_path() - if "dresser_task_oriented" in list(problems.keys()): - mpinets_data = True - - mg, robot_cfg, robot_world = load_curobo( - 1, - enable_debug, - og_tsteps, - og_trajopt_seeds, - mpinets_data, - graph_mode, - not args.disable_cuda_graph, - ) for key, v in tqdm(problems.items()): scene_problems = problems[key] m_list = [] - i = 0 + i = -1 ik_fail = 0 + trajopt_seeds = og_trajopt_seeds + tsteps = og_tsteps + collision_activation_distance = og_collision_activation_distance + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + + if "cage_panda" in key: + trajopt_seeds = 16 + finetune_dt_scale = 0.95 + if "table_under_pick_panda" in key: + tsteps = 44 + trajopt_seeds = 16 + finetune_dt_scale = 0.98 + + if "cubby_task_oriented" in key and "merged" not in key: + trajopt_seeds = 24 + finetune_dt_scale = 0.95 + collision_activation_distance = 0.035 + if "dresser_task_oriented" in key: + trajopt_seeds = 24 + finetune_dt_scale = 0.95 + collision_activation_distance = 0.035 # 0.035 + + if "merged_cubby_task_oriented" in key: + collision_activation_distance = 0.025 # 0.035 + if "tabletop_task_oriented" in key: + collision_activation_distance = 0.025 # 0.035 + if key in ["dresser_neutral_goal"]: + trajopt_seeds = 24 + collision_activation_distance = og_collision_activation_distance + mg, robot_cfg, robot_world = load_curobo( + 1, + enable_debug, + tsteps, + trajopt_seeds, + mpinets_data, + graph_mode, + not args.disable_cuda_graph, + collision_activation_distance=collision_activation_distance, + ) for problem in tqdm(scene_problems, leave=False): i += 1 + if problem["collision_buffer_ik"] < 0.0: + continue plan_config = MotionGenPlanConfig( - max_attempts=10, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000, - enable_graph_attempt=3, - disable_graph_attempt=20, + max_attempts=10, + enable_graph_attempt=1, enable_finetune_trajopt=True, partial_ik_opt=False, enable_graph=graph_mode, timeout=60, enable_opt=not graph_mode, + parallel_finetune=True, ) q_start = problem["start"] @@ -265,23 +297,15 @@ def benchmark_mb( mg.reset(reset_seed=False) world = WorldConfig.from_dict(problem["obstacles"]) - # .get_mesh_world( - # # merge_meshes=True - # ) - # mesh = world.mesh[0].get_trimesh_mesh() - - # world.save_world_as_mesh(problem_name + ".stl") + mg.world_coll_checker.clear_cache() mg.world_coll_checker.update_blox_hashes() - mg.world_coll_checker.clear_cache() save_path = "benchmark/log/nvblox/" + key + "_" + str(i) m_dataset = Sun3dDataset(save_path) - # m_dataset = MeshDataset( - # None, n_frames=100, image_size=640, save_data_dir=None, trimesh_mesh=mesh - # ) + tensor_args = mg.tensor_args - for j in tqdm(range(len(m_dataset)), leave=False): + for j in tqdm(range(min(1, len(m_dataset))), leave=False): data = m_dataset[j] cam_obs = CameraObservation( rgb_image=tensor_args.to_device(data["rgba"]) @@ -300,35 +324,12 @@ def benchmark_mb( torch.cuda.synchronize() mg.world_coll_checker.update_blox_hashes() torch.cuda.synchronize() + # if i > 2: + # mg.world_coll_checker.clear_cache() + # mg.world_coll_checker.update_blox_hashes() + # mg.world_coll_checker.save_layer("world", "test.nvblx") - if save_log or write_usd: - world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) - - nvblox_obs = 
mg.world_coll_checker.get_mesh_from_blox_layer( - "world", - ) - # nvblox_obs.vertex_colors = None - - if nvblox_obs.vertex_colors is not None: - nvblox_obs.vertex_colors = nvblox_obs.vertex_colors.cpu().numpy() - else: - nvblox_obs.color = [0.0, 0.0, 0.8, 0.8] - - nvblox_obs.name = "nvblox_mesh_world" - world.add_obstacle(nvblox_obs) - - coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box( - curobo_Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.5, 1.5, 1]), - voxel_size=0.005, - ) - - coll_mesh.color = [0.0, 0.8, 0.8, 0.8] - - coll_mesh.name = "nvblox_voxel_world" - world.add_obstacle(coll_mesh) - # exit() - # run planner start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) result = mg.plan_single( start_state, @@ -338,20 +339,6 @@ def benchmark_mb( if result.status == "IK Fail": ik_fail += 1 problem["solution"] = None - if write_usd or save_log: - # CuRobo - from curobo.util.usd_helper import UsdHelper - - gripper_mesh = Mesh( - name="target_gripper_1", - file_path=join_path( - get_assets_path(), - "robot/franka_description/meshes/visual/hand.dae", - ), - color=[0.0, 0.8, 0.1, 1.0], - pose=pose, - ) - world.add_obstacle(gripper_mesh) # get costs: if plot_cost: @@ -437,9 +424,12 @@ def benchmark_mb( "valid_query": result.valid_query, } problem["solution_debug"] = debug + world_coll = WorldConfig.from_dict(problem["obstacles"]).get_obb_world() + # check if path is collision free w.r.t. ground truth mesh: - robot_world.world_model.clear_cache() - robot_world.update_world(world) + # robot_world.world_model.clear_cache() + robot_world.update_world(world_coll) + q_int_traj = result.get_interpolated_plan().position.unsqueeze(0) d_int_mask = ( torch.count_nonzero(~robot_world.validate_trajectory(q_int_traj)) == 0 @@ -449,7 +439,14 @@ def benchmark_mb( d_mask = ( torch.count_nonzero(~robot_world.validate_trajectory(q_traj)) == 0 ).item() - + # d_world, _ = robot_world.get_world_self_collision_distance_from_joint_trajectory( + # q_traj) + # thres_dist = robot_world.contact_distance + # in_collision = d_world.squeeze(0) > thres_dist + # d_mask = not torch.any(in_collision, dim=-1).item() + # if not d_mask: + # write_usd = True + # #print(torch.max(d_world).item(), problem_name) current_metrics = CuroboMetrics( skip=False, success=True, @@ -466,10 +463,40 @@ def benchmark_mb( attempts=result.attempts, motion_time=result.motion_time.item(), solve_time=result.solve_time, + jerk=torch.max(torch.abs(result.optimized_plan.jerk)).item(), ) + if save_log or write_usd: + world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) + + nvblox_obs = mg.world_coll_checker.get_mesh_from_blox_layer( + "world", + ) + # nvblox_obs.vertex_colors = None + + if nvblox_obs.vertex_colors is not None: + nvblox_obs.vertex_colors = nvblox_obs.vertex_colors.cpu().numpy() + else: + nvblox_obs.color = [0.0, 0.0, 0.8, 0.8] + + nvblox_obs.name = "nvblox_mesh_world" + world.add_obstacle(nvblox_obs) + + coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box( + curobo_Cuboid( + name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.5, 1.5, 1] + ), + voxel_size=0.005, + ) + + coll_mesh.color = [0.0, 0.8, 0.8, 0.8] + + coll_mesh.name = "nvblox_voxel_world" + world.add_obstacle(coll_mesh) + # run planner if write_usd: # CuRobo + from curobo.util.usd_helper import UsdHelper q_traj = result.get_interpolated_plan() UsdHelper.write_trajectory_animation_with_robot_usd( @@ -486,7 +513,8 @@ def benchmark_mb( visualize_robot_spheres=True, # flatten_usd=True, ) - exit() + # write_usd = False + # exit() 
if write_plot: problem_name = problem_name plot_traj( @@ -540,6 +568,9 @@ def benchmark_mb( } problem["solution_debug"] = debug if save_log and not result.success.item(): + # CuRobo + from curobo.util.usd_helper import UsdHelper + UsdHelper.write_motion_gen_log( result, robot_cfg, @@ -553,7 +584,7 @@ def benchmark_mb( grid_space=2, # flatten_usd=True, ) - # exit() + exit() g_m = CuroboGroupMetrics.from_list(m_list) print( key, @@ -566,7 +597,7 @@ def benchmark_mb( g_m.orientation_error.percent_98, g_m.cspace_path_length.percent_98, g_m.motion_time.percent_98, - g_m.perception_success, + g_m.perception_interpolated_success, # g_m.orientation_error.median, ) print(g_m.attempts) @@ -600,6 +631,7 @@ def benchmark_mb( print("PT:", g_m.time) print("ST: ", g_m.solve_time) print("accuracy: ", g_m.position_error, g_m.orientation_error) + print("Jerk: ", g_m.jerk) if __name__ == "__main__": diff --git a/benchmark/curobo_profile.py b/benchmark/curobo_profile.py index c155718..77cbcb2 100644 --- a/benchmark/curobo_profile.py +++ b/benchmark/curobo_profile.py @@ -73,8 +73,8 @@ def load_curobo( position_threshold=0.005, rotation_threshold=0.05, num_ik_seeds=30, - num_trajopt_seeds=10, - interpolation_dt=0.02, + num_trajopt_seeds=12, + interpolation_dt=0.025, # grad_trajopt_iters=200, store_ik_debug=enable_log, store_trajopt_debug=enable_log, @@ -91,7 +91,7 @@ def benchmark_mb(args): spheres = load_yaml(join_path(get_robot_configs_path(), spheres))["collision_spheres"] plan_config = MotionGenPlanConfig( - max_attempts=2, + max_attempts=1, enable_graph_attempt=3, enable_finetune_trajopt=True, partial_ik_opt=False, @@ -130,11 +130,18 @@ def benchmark_mb(args): world = WorldConfig.from_dict(problem["obstacles"]).get_obb_world() mg.update_world(world) + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + print(result.total_time, result.solve_time) # continue # load obstacles # run planner - start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: result = mg.plan_single( start_state, diff --git a/benchmark/generate_nvblox_images.py b/benchmark/generate_nvblox_images.py index 13b0f83..48681c0 100644 --- a/benchmark/generate_nvblox_images.py +++ b/benchmark/generate_nvblox_images.py @@ -43,7 +43,7 @@ def generate_images(): for key, v in tqdm(problems.items()): scene_problems = problems[key] - i = 0 + i = -1 for problem in tqdm(scene_problems, leave=False): i += 1 diff --git a/benchmark/metrics.py b/benchmark/metrics.py index 751a4b5..b4cd4c3 100644 --- a/benchmark/metrics.py +++ b/benchmark/metrics.py @@ -29,6 +29,7 @@ class CuroboMetrics(TrajectoryMetrics): cspace_path_length: float = 0.0 perception_success: bool = False perception_interpolated_success: bool = False + jerk: float = np.inf @dataclass @@ -37,6 +38,7 @@ class CuroboGroupMetrics(TrajectoryGroupMetrics): cspace_path_length: Optional[Statistic] = None perception_success: float = 0.0 perception_interpolated_success: float = 0.0 + jerk: float = np.inf @classmethod def from_list(cls, group: List[CuroboMetrics]): @@ -49,5 +51,6 @@ class CuroboGroupMetrics(TrajectoryGroupMetrics): data.perception_interpolated_success = percent_true( [m.perception_interpolated_success for m in group] ) + data.jerk = Statistic.from_list([m.jerk for m in successes]) return data diff --git a/docker/aarch64.dockerfile b/docker/aarch64.dockerfile index 
b7a2130..e1ee188 100644 --- a/docker/aarch64.dockerfile +++ b/docker/aarch64.dockerfile @@ -129,24 +129,15 @@ RUN pip3 install trimesh \ empy # Add cache date to avoid using cached layers older than this -ARG CACHE_DATE=2023-12-15 - -# warp from https://github.com/NVIDIA/warp needs to be compiled locally and then -# placed in curobo/docker/pkgs. -# Run the following from your terminal: -# cd curobo/docker && mkdir pkgs && cd pkgs && git clone https://github.com/NVIDIA/warp.git -# cd warp && python build_libs.py -# -# copy pkgs directory: -COPY pkgs /pkgs +ARG CACHE_DATE=2024-02-20 # install warp: # -RUN cd /pkgs/warp && pip3 install . +RUN pip3 install warp-lang # install curobo: -RUN cd /pkgs && git clone https://github.com/NVlabs/curobo.git +RUN mkdir /pkgs && git clone https://github.com/NVlabs/curobo.git ENV TORCH_CUDA_ARCH_LIST "7.0+PTX" @@ -157,6 +148,7 @@ WORKDIR /pkgs/curobo # Optionally install nvblox: ENV PYOPENGL_PLATFORM=egl + RUN apt-get update && \ apt-get install -y libgoogle-glog-dev libgtest-dev curl libsqlite3-dev libbenchmark-dev && \ cd /usr/src/googletest && cmake . && cmake --build . --target install && \ @@ -172,6 +164,7 @@ RUN cd /pkgs && git clone https://github.com/nvlabs/nvblox_torch.git && \ cd nvblox_torch && \ sh install.sh $(python3 -c 'import torch.utils; print(torch.utils.cmake_prefix_path)') && \ python3 -m pip install -e . + RUN python -m pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git" diff --git a/docker/user.dockerfile b/docker/user.dockerfile index ad17095..50bc6dc 100644 --- a/docker/user.dockerfile +++ b/docker/user.dockerfile @@ -15,6 +15,7 @@ FROM curobo_docker:${IMAGE_TAG} # Set variables ARG USERNAME ARG USER_ID +ARG CACHE_DATE=2024-02-20 # Set environment variables diff --git a/docker/x86.dockerfile b/docker/x86.dockerfile index 9156293..4725bf3 100644 --- a/docker/x86.dockerfile +++ b/docker/x86.dockerfile @@ -78,7 +78,7 @@ ENV TORCH_CUDA_ARCH_LIST "7.0+PTX" ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" # Add cache date to avoid using cached layers older than this -ARG CACHE_DATE=2023-12-15 +ARG CACHE_DATE=2024-02-20 RUN pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git" diff --git a/examples/isaac_sim/constrained_reacher.py b/examples/isaac_sim/constrained_reacher.py new file mode 100644 index 0000000..83f6041 --- /dev/null +++ b/examples/isaac_sim/constrained_reacher.py @@ -0,0 +1,355 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) + +# Third Party +import numpy as np +import torch +from helper import add_extensions, add_robot_to_scene + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import ( + MotionGen, + MotionGenConfig, + MotionGenPlanConfig, + PoseCostMetric, +) + +simulation_app.update() +# Standard Library +import argparse + +# Third Party +import carb +from omni.isaac.core import World +from omni.isaac.core.materials import OmniGlass, OmniPBR +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.usd_helper import UsdHelper + +parser = argparse.ArgumentParser() + + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") + +args = parser.parse_args() + + +if __name__ == "__main__": + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + n_obstacle_cuboids = 10 + n_obstacle_mesh = 10 + + stage = my_world.stage + my_world.scene.add_default_ground_plane() + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0])) + target_material_2 = OmniPBR("/World/looks/t2", color=np.array([0, 1, 0])) + target_material_plane = OmniGlass( + "/World/looks/t3", color=np.array([0, 1, 0]), ior=1.25, depth=0.001, thin_walled=False + ) + target_material_line = OmniGlass( + "/World/looks/t4", color=np.array([0, 1, 0]), ior=1.25, depth=0.001, thin_walled=True + ) + + # target_orient = [0,0,0.707,0.707] + target_orient = [0.5, -0.5, 0.5, 0.5] + + target = cuboid.VisualCuboid( + "/World/target_1", + position=np.array([0.55, -0.3, 0.5]), + orientation=np.array(target_orient), + size=0.04, + visual_material=target_material, + ) + + # Make a target to follow + target_2 = cuboid.VisualCuboid( + "/World/target_2", + position=np.array([0.55, 0.4, 0.5]), + orientation=np.array(target_orient), + size=0.04, + visual_material=target_material_2, + ) + + x_plane = cuboid.VisualCuboid( + "/World/constraint_plane", + position=np.array([0.55, 0.05, 0.5]), + orientation=np.array(target_orient), + scale=[1.1, 0.001, 1.0], + visual_material=target_material_plane, + ) + xz_line = cuboid.VisualCuboid( + "/World/constraint_line", + position=np.array([0.55, 0.05, 0.5]), + orientation=np.array(target_orient), + scale=[0.04, 0.04, 0.65], + visual_material=target_material_line, + ) + + collision_checker_type = CollisionCheckerType.BLOX + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world, "/World/world_robot/") + + world_cfg_table = WorldConfig.from_dict( + 
load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + + world_cfg_table.cuboid[0].pose[2] -= 0.01 + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].name += "_mesh" + world_cfg1.mesh[0].pose[2] = -10.5 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + usd_help = UsdHelper() + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg_table.get_mesh_world(), base_frame="/World") + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + collision_checker_type=CollisionCheckerType.MESH, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + velocity_scale=0.75, + interpolation_dt=0.02, + ee_link_name="right_gripper", + ) + motion_gen = MotionGen(motion_gen_config) + print("warming up..") + motion_gen.warmup(warmup_js_trajopt=False) + + world_model = motion_gen.world_collision + + i = 0 + tensor_args = TensorDeviceType() + target_list = [target, target_2] + target_material_list = [target_material, target_material_2] + for material in target_material_list: + material.set_color(np.array([0.1, 0.1, 0.1])) + target_material_plane.set_color(np.array([1, 1, 1])) + target_material_line.set_color(np.array([1, 1, 1])) + + target_idx = 0 + cmd_idx = 0 + cmd_plan = None + articulation_controller = robot.get_articulation_controller() + plan_config = MotionGenPlanConfig( + enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True + ) + + plan_idx = 0 + cmd_step_idx = 0 + pose_cost_metric = None + while simulation_app.is_running(): + my_world.step(render=True) + + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + step_index = my_world.current_time_step_index + + if step_index <= 2: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + + if False and step_index % 50 == 0.0: # No obstacle update + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + reference_prim_path=robot_prim_path, + ignore_substring=[ + robot_prim_path, + "/World/target", + "/World/defaultGroundPlane", + "/curobo", + ], + ).get_collision_check_world() + # print(len(obstacles.objects)) + + motion_gen.update_world(obstacles) + # print("Updated World") + carb.log_info("Synced CuRobo world from stage.") + + linear_color = np.ravel([249, 87, 56]) / 255.0 + orient_color = np.ravel([103, 148, 54]) / 255.0 + + disable_color = np.ravel([255, 255, 255]) / 255.0 + + if cmd_plan is None and step_index % 10 == 0: + + if plan_idx == 4: + print("Constrained: Holding tool linear-y") + pose_cost_metric = PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 0, 1, 0]), + ) + target_material_plane.set_color(linear_color) + if plan_idx == 8: + print("Constrained: Holding tool Orientation and linear-y") + pose_cost_metric = PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 0, 1, 0]), + ) + target_material_plane.set_color(orient_color) + if plan_idx == 12: + print("Constrained: Holding tool linear-y, linear-x") + pose_cost_metric = 
PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 1, 1, 0]), + ) + target_material_line.set_color(linear_color) + target_material_plane.set_color(disable_color) + + if plan_idx == 16: + print("Constrained: Holding tool Orientation and linear-y, linear-x") + pose_cost_metric = PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 1, 1, 0]), + ) + target_material_line.set_color(orient_color) + target_material_plane.set_color(disable_color) + + if plan_idx > 20: + plan_idx = 0 + + if plan_idx == 0: + print("Constrained: Reset") + target_material_line.set_color(disable_color) + target_material_plane.set_color(disable_color) + + pose_cost_metric = None + + plan_config.pose_cost_metric = pose_cost_metric + + # motion generation: + for ks in range(len(target_material_list)): + if ks == target_idx: + target_material_list[ks].set_color(np.ravel([0, 1.0, 0])) + else: + target_material_list[ks].set_color(np.ravel([0.1, 0.1, 0.1])) + + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + + cube_position, cube_orientation = target_list[target_idx].get_world_pose() + + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = result.success.item() # ik_result.success.item() + plan_idx += 1 + if succ: + cmd_plan = result.get_interpolated_plan() + cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + target_idx += 1 + if target_idx >= len(target_list): + target_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. 
No action is being taken.") + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_step_idx += 1 + if cmd_step_idx == 2: + cmd_idx += 1 + cmd_step_idx = 0 + # for _ in range(2): + # my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + print("finished program") + + simulation_app.close() diff --git a/examples/isaac_sim/motion_gen_reacher.py b/examples/isaac_sim/motion_gen_reacher.py index 95d4749..6461fd7 100644 --- a/examples/isaac_sim/motion_gen_reacher.py +++ b/examples/isaac_sim/motion_gen_reacher.py @@ -51,6 +51,32 @@ parser.add_argument( help="When True, runs in reactive mode", default=False, ) + +parser.add_argument( + "--constrain_grasp_approach", + action="store_true", + help="When True, approaches grasp with fixed orientation and motion only along z axis.", + default=False, +) + +parser.add_argument( + "--reach_partial_pose", + nargs=6, + metavar=("qx", "qy", "qz", "x", "y", "z"), + help="Reach partial pose", + type=float, + default=None, +) +parser.add_argument( + "--hold_partial_pose", + nargs=6, + metavar=("qx", "qy", "qz", "x", "y", "z"), + help="Hold partial pose while moving to goal", + type=float, + default=None, +) + + args = parser.parse_args() ############################################################ @@ -97,7 +123,12 @@ from curobo.util_file import ( join_path, load_yaml, ) -from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig +from curobo.wrap.reacher.motion_gen import ( + MotionGen, + MotionGenConfig, + MotionGenPlanConfig, + PoseCostMetric, +) ############################################################ @@ -107,7 +138,7 @@ from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGen def main(): # create a curobo motion gen instance: - + num_targets = 0 # assuming obstacles are in objects_path: my_world = World(stage_units_in_meters=1.0) stage = my_world.stage @@ -152,12 +183,12 @@ def main(): robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world) - articulation_controller = robot.get_articulation_controller() + articulation_controller = None world_cfg_table = WorldConfig.from_dict( load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) ) - world_cfg_table.cuboid[0].pose[2] -= 0.04 + world_cfg_table.cuboid[0].pose[2] -= 0.02 world_cfg1 = WorldConfig.from_dict( load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) ).get_mesh_world() @@ -171,12 +202,14 @@ def main(): trajopt_tsteps = 32 trim_steps = None max_attempts = 4 + interpolation_dt = 0.05 if args.reactive: - trajopt_tsteps = 36 + trajopt_tsteps = 40 trajopt_dt = 0.05 optimize_dt = False - max_attemtps = 1 + max_attempts = 1 trim_steps = [1, None] + interpolation_dt = trajopt_dt motion_gen_config = MotionGenConfig.load_from_robot_config( robot_cfg, world_cfg, @@ -184,12 +217,15 @@ def main(): collision_checker_type=CollisionCheckerType.MESH, num_trajopt_seeds=12, num_graph_seeds=12, - interpolation_dt=0.05, + interpolation_dt=interpolation_dt, collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, optimize_dt=optimize_dt, trajopt_dt=trajopt_dt, trajopt_tsteps=trajopt_tsteps, trim_steps=trim_steps, + # velocity_scale=0.1, + # self_collision_check=False, + # self_collision_opt=False, ) 
motion_gen = MotionGen(motion_gen_config) print("warming up...") @@ -216,6 +252,9 @@ def main(): i = 0 spheres = None past_cmd = None + target_orientation = None + past_orientation = None + pose_metric = None while simulation_app.is_running(): my_world.step(render=True) if not my_world.is_playing(): @@ -228,6 +267,9 @@ def main(): step_index = my_world.current_time_step_index # print(step_index) + if articulation_controller is None: + # robot.initialize() + articulation_controller = robot.get_articulation_controller() if step_index < 2: my_world.reset() robot._articulation_view.initialize() @@ -265,6 +307,10 @@ def main(): past_pose = cube_position if target_pose is None: target_pose = cube_position + if target_orientation is None: + target_orientation = cube_orientation + if past_orientation is None: + past_orientation = cube_orientation sim_js = robot.get_joints_state() sim_js_names = robot.dof_names @@ -313,8 +359,12 @@ def main(): if (np.max(np.abs(sim_js.velocities)) < 0.2) or args.reactive: robot_static = True if ( - np.linalg.norm(cube_position - target_pose) > 1e-3 + ( + np.linalg.norm(cube_position - target_pose) > 1e-3 + or np.linalg.norm(cube_orientation - target_orientation) > 1e-3 + ) and np.linalg.norm(past_pose - cube_position) == 0.0 + and np.linalg.norm(past_orientation - cube_orientation) == 0.0 and robot_static ): # Set EE teleop goals, use cube for simple non-vr init: @@ -326,12 +376,24 @@ def main(): position=tensor_args.to_device(ee_translation_goal), quaternion=tensor_args.to_device(ee_orientation_teleop_goal), ) - + plan_config.pose_cost_metric = pose_metric result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config) # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) succ = result.success.item() # ik_result.success.item() + if num_targets == 1: + if args.constrain_grasp_approach: + pose_metric = PoseCostMetric.create_grasp_approach_metric() + if args.reach_partial_pose is not None: + reach_vec = motion_gen.tensor_args.to_device(args.reach_partial_pose) + pose_metric = PoseCostMetric( + reach_partial_pose=True, reach_vec_weight=reach_vec + ) + if args.hold_partial_pose is not None: + hold_vec = motion_gen.tensor_args.to_device(args.hold_partial_pose) + pose_metric = PoseCostMetric(hold_partial_pose=True, hold_vec_weight=hold_vec) if succ: + num_targets += 1 cmd_plan = result.get_interpolated_plan() cmd_plan = motion_gen.get_full_js(cmd_plan) # get only joint names that are in both: @@ -350,7 +412,9 @@ def main(): else: carb.log_warn("Plan did not converge to a solution. 
No action is being taken.") target_pose = cube_position + target_orientation = cube_orientation past_pose = cube_position + past_orientation = cube_orientation if cmd_plan is not None: cmd_state = cmd_plan[cmd_idx] past_cmd = cmd_state.clone() diff --git a/examples/isaac_sim/simple_stacking.py b/examples/isaac_sim/simple_stacking.py index a72bf83..5d3422f 100644 --- a/examples/isaac_sim/simple_stacking.py +++ b/examples/isaac_sim/simple_stacking.py @@ -34,6 +34,13 @@ parser.add_argument( default=None, help="To run headless, use one of [native, websocket], webrtc might not work.", ) + +parser.add_argument( + "--constrain_grasp_approach", + action="store_true", + help="When True, approaches grasp with fixed orientation and motion only along z axis.", + default=False, +) args = parser.parse_args() # Third Party @@ -78,6 +85,7 @@ from curobo.wrap.reacher.motion_gen import ( MotionGenConfig, MotionGenPlanConfig, MotionGenResult, + PoseCostMetric, ) @@ -87,6 +95,7 @@ class CuroboController(BaseController): my_world: World, my_task: BaseStacking, name: str = "curobo_controller", + constrain_grasp_approach: bool = False, ) -> None: BaseController.__init__(self, name=name) self._save_log = False @@ -145,13 +154,16 @@ class CuroboController(BaseController): collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, store_ik_debug=self._save_log, store_trajopt_debug=self._save_log, - state_finite_difference_mode="CENTRAL", - acceleration_scale=0.5, - fixed_iters_trajopt=True, + velocity_scale=0.75, ) self.motion_gen = MotionGen(motion_gen_config) print("warming up...") - self.motion_gen.warmup() + self.motion_gen.warmup(parallel_finetune=True) + pose_metric = None + if constrain_grasp_approach: + pose_metric = PoseCostMetric.create_grasp_approach_metric( + offset_position=0.1, tstep_fraction=0.6 + ) self.plan_config = MotionGenPlanConfig( enable_graph=False, @@ -159,6 +171,8 @@ class CuroboController(BaseController): enable_graph_attempt=None, enable_finetune_trajopt=True, partial_ik_opt=False, + parallel_finetune=True, + pose_cost_metric=pose_metric, ) self.usd_help.load_stage(self.my_world.stage) self.cmd_plan = None @@ -185,7 +199,7 @@ class CuroboController(BaseController): cu_js, [cube_name], sphere_fit_type=SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE, - world_objects_pose_offset=Pose.from_list([0, 0, 0.005, 1, 0, 0, 0], self.tensor_args), + world_objects_pose_offset=Pose.from_list([0, 0, 0.01, 1, 0, 0, 0], self.tensor_args), ) def detach_obj(self) -> None: @@ -259,7 +273,7 @@ class CuroboController(BaseController): # get full dof state art_action = ArticulationAction( cmd_state.position.cpu().numpy(), - # cmd_state.velocity.cpu().numpy(), + cmd_state.velocity.cpu().numpy() * 0.0, joint_indices=self.idx_list, ) if self.cmd_idx >= len(self.cmd_plan.position): @@ -417,7 +431,9 @@ my_world.add_task(my_task) my_world.reset() robot_name = my_task.get_params()["robot_name"]["value"] my_franka = my_world.scene.get_object(robot_name) -my_controller = CuroboController(my_world=my_world, my_task=my_task) +my_controller = CuroboController( + my_world=my_world, my_task=my_task, constrain_grasp_approach=args.constrain_grasp_approach +) articulation_controller = my_franka.get_articulation_controller() set_camera_view(eye=[2, 0, 1], target=[0.00, 0.00, 0.00], camera_prim_path="/OmniverseKit_Persp") wait_steps = 8 @@ -439,17 +455,17 @@ print(articulation_controller.get_gains()) print(articulation_controller.get_max_efforts()) robot = my_franka print("**********************") -robot.enable_gravity() 
-articulation_controller.set_gains( - kps=np.array( - [100000000, 6000000.0, 10000000, 600000.0, 25000.0, 15000.0, 50000.0, 6000.0, 6000.0] +if True: + robot.enable_gravity() + articulation_controller.set_gains( + kps=np.array( + [100000000, 6000000.0, 10000000, 600000.0, 25000.0, 15000.0, 50000.0, 6000.0, 6000.0] + ) ) -) - -articulation_controller.set_max_efforts( - values=np.array([100000, 52.199997, 100000, 52.199997, 7.2, 7.2, 7.2, 50.0, 50]) -) + articulation_controller.set_max_efforts( + values=np.array([100000, 52.199997, 100000, 52.199997, 7.2, 7.2, 7.2, 50.0, 50]) + ) print("Updated gains:") print(articulation_controller.get_gains()) diff --git a/examples/kinematics_example.py b/examples/kinematics_example.py index a32827a..9b83829 100644 --- a/examples/kinematics_example.py +++ b/examples/kinematics_example.py @@ -15,7 +15,7 @@ import torch # CuRobo -from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel from curobo.types.base import TensorDeviceType from curobo.types.robot import RobotConfig from curobo.util.logger import setup_curobo_logger diff --git a/examples/motion_gen_example.py b/examples/motion_gen_example.py index 789f33b..820d8c1 100644 --- a/examples/motion_gen_example.py +++ b/examples/motion_gen_example.py @@ -42,8 +42,9 @@ def plot_traj(trajectory, dt): axs[3].plot(timesteps, qddd[:, i], label=str(i)) plt.legend() - # plt.savefig("test.png") - plt.show() + plt.savefig("test.png") + plt.close() + # plt.show() def plot_iters_traj(trajectory, d_id=1, dof=7, seed=0): @@ -140,37 +141,54 @@ def demo_motion_gen(js=False): world_file, tensor_args, interpolation_dt=0.01, + # trajopt_dt=0.15, + # velocity_scale=0.1, + use_cuda_graph=True, + # finetune_dt_scale=2.5, + interpolation_steps=10000, ) motion_gen = MotionGen(motion_gen_config) + motion_gen.warmup(parallel_finetune=True) - motion_gen.warmup(enable_graph=True, warmup_js_trajopt=js) - robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] - robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + # motion_gen.warmup(enable_graph=True, warmup_js_trajopt=js, parallel_finetune=True) + # robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + # robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) retract_cfg = motion_gen.get_retract_config() state = motion_gen.rollout_fn.compute_kinematics( JointState.from_position(retract_cfg.view(1, -1)) ) retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) - start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.1) + start_state = JointState.from_position(retract_cfg.view(1, -1)) goal_state = start_state.clone() - goal_state.position[..., 3] -= 0.1 + + start_state.position[0, 0] += 0.25 + # goal_state.position[0,0] += 0.5 if js: result = motion_gen.plan_single_js( start_state, goal_state, - MotionGenPlanConfig( - max_attempts=1, enable_graph=False, enable_opt=True, enable_finetune_trajopt=True - ), + MotionGenPlanConfig(max_attempts=1, parallel_finetune=True), ) else: result = motion_gen.plan_single( - start_state, retract_pose, MotionGenPlanConfig(max_attempts=1) + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, parallel_finetune=True, enable_finetune_trajopt=True + ), ) - traj = result.get_interpolated_plan() - print("Trajectory Generated: ", result.success, result.solve_time, result.status) + print( + "Trajectory Generated: ", + result.success, 
+ result.solve_time, + result.status, + result.optimized_dt, + ) if PLOT and result.success.item(): + traj = result.get_interpolated_plan() + plot_traj(traj, result.interpolation_dt) # plt.save("test.png") # plt.close() @@ -261,6 +279,45 @@ def demo_motion_gen_batch(): plot_traj(traj[1, : result.path_buffer_last_tstep[1], :].cpu().numpy()) +def demo_motion_gen_goalset(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=1, + num_ik_seeds=30, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.6) + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(2, 1).view(1, -1, 3), + quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4), + ) + goal_pose.position[0, 0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen.plan_goalset(start_state, goal_pose, m_config) + + def demo_motion_gen_api(): tensor_args = TensorDeviceType(device=torch.device("cuda:0")) interpolation_dt = 0.02 @@ -373,8 +430,9 @@ def demo_motion_gen_batch_env(n_envs: int = 10): if __name__ == "__main__": setup_curobo_logger("error") - # demo_motion_gen(js=True) - demo_motion_gen_batch() + demo_motion_gen(js=False) + # demo_motion_gen_batch() + # demo_motion_gen_goalset() # n = [2, 10] # for n_envs in n: # demo_motion_gen_batch_env(n_envs=n_envs) diff --git a/examples/robot_image_segmentation_example.py b/examples/robot_image_segmentation_example.py new file mode 100644 index 0000000..d98fcdb --- /dev/null +++ b/examples/robot_image_segmentation_example.py @@ -0,0 +1,238 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +""" This example shows how to use cuRobo's kinematics to generate a mask. 
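+It renders depth images of the robot in a sample scene with nvblox_torch's MeshDataset, computes
+a per-pixel robot mask with RobotSegmenter, and saves the segmented robot/world depth images and
+point clouds for inspection.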
""" + + +# Standard Library +import time + +# Third Party +import imageio +import numpy as np +import torch +import torch.autograd.profiler as profiler +from nvblox_torch.datasets.mesh_dataset import MeshDataset +from torch.profiler import ProfilerActivity, profile, record_function + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.geom.types import PointCloud, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_segmenter import RobotSegmenter +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + +torch.manual_seed(30) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + + +def create_render_dataset(robot_file, save_debug_data: bool = False): + # load robot: + robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file)) + robot_dict["robot_cfg"]["kinematics"]["load_link_names_with_mesh"] = True + robot_dict["robot_cfg"]["kinematics"]["load_meshes"] = True + + robot_cfg = RobotConfig.from_dict(robot_dict["robot_cfg"]) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + + q = kin_model.retract_config + + meshes = kin_model.get_robot_as_mesh(q) + + world = WorldConfig(mesh=meshes[:]) + world_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_test.yml")) + ) + world_table.cuboid[0].dims = [0.5, 0.5, 0.1] + world.add_obstacle(world_table.objects[0]) + world.add_obstacle(world_table.objects[1]) + if save_debug_data: + world.save_world_as_mesh("scene.stl", process_color=False) + robot_mesh = ( + WorldConfig.create_merged_mesh_world(world, process_color=False).mesh[0].get_trimesh_mesh() + ) + + mesh_dataset = MeshDataset( + None, + n_frames=20, + image_size=480, + save_data_dir=None, + trimesh_mesh=robot_mesh, + ) + q_js = JointState(position=q, joint_names=kin_model.joint_names) + + return mesh_dataset, q_js + + +def mask_image(robot_file="ur5e.yml"): + save_debug_data = False + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data) + + if save_debug_data: + visualize_scale = 10.0 + data = mesh_dataset[0] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + # save depth image + imageio.imwrite( + "camera_depth.png", + (cam_obs.depth_image * visualize_scale) + .squeeze() + .detach() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # save robot spheres in current joint configuration + robot_kinematics = curobo_segmenter._robot_world.kinematics + sph = robot_kinematics.get_robot_as_spheres(q_js.position) + WorldConfig(sphere=sph[0]).save_world_as_mesh("robot_spheres.stl") + + # save world pointcloud in robot origin + + pc = cam_obs.get_pointcloud() + pc_obs = PointCloud("world", pose=cam_obs.pose.to_list(), points=pc) + pc_obs.save_as_mesh("camera_pointcloud.stl", transform_with_pose=True) + + # run segmentation: + 
depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + # save robot points as mesh + + robot_mask = cam_obs.clone() + robot_mask.depth_image[~depth_mask] = 0.0 + robot_mesh = PointCloud( + "world", pose=robot_mask.pose.to_list(), points=robot_mask.get_pointcloud() + ) + robot_mesh.save_as_mesh("robot_segmented.stl", transform_with_pose=True) + # save depth image + imageio.imwrite( + "robot_depth.png", + (robot_mask.depth_image * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # save world points as mesh + + world_mask = cam_obs.clone() + world_mask.depth_image[depth_mask] = 0.0 + world_mesh = PointCloud( + "world", pose=world_mask.pose.to_list(), points=world_mask.get_pointcloud() + ) + world_mesh.save_as_mesh("world_segmented.stl", transform_with_pose=True) + + imageio.imwrite( + "world_depth.png", + (world_mask.depth_image * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + dt_list = [] + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + st_time = time.time() + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + + torch.cuda.synchronize() + dt_list.append(time.time() - st_time) + + print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:])) + + +def profile_mask_image(robot_file="ur5e.yml"): + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.0, distance_threshold=0.05, use_cuda_graph=False + ) + + mesh_dataset, q_js = create_render_dataset(robot_file) + + dt_list = [] + data = mesh_dataset[0] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js(cam_obs, q_js) + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + + for i in range(len(mesh_dataset)): + with profiler.record_function("get_data"): + + data = mesh_dataset[i] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + st_time = time.time() + with profiler.record_function("segmentation"): + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, q_js + ) + + print("Exporting the trace..") + prof.export_chrome_trace("segmentation.json") + + +if __name__ == "__main__": + mask_image() + # profile_mask_image() diff --git a/setup.py b/setup.py index bd6a1d7..c875560 100644 --- a/setup.py +++ b/setup.py @@ -87,4 +87,6 @@ ext_modules = [ setuptools.setup( ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension}, + package_data={"": ["*.so"]}, + include_package_data=True, ) diff --git a/src/curobo/__init__.py b/src/curobo/__init__.py index 58d5ff0..d56871c 100644 --- 
a/src/curobo/__init__.py +++ b/src/curobo/__init__.py @@ -9,7 +9,22 @@ # its affiliates is strictly prohibited. # -"""CuRobo package.""" +""" +The cuRobo package is split into several modules: + +- :mod:`curobo.opt` contains optimization solvers. +- :mod:`curobo.cuda_robot_model` contains robot kinematics. +- :mod:`curobo.curobolib` contains the CUDA kernels and Python bindings for them. +- :mod:`curobo.geom` contains geometry processing, collision checking and frame transforms. +- :mod:`curobo.graph` contains geometric planning with graph search methods. +- :mod:`curobo.rollout` contains methods that map actions to costs. This module wraps instances of + :mod:`curobo.cuda_robot_model` and :mod:`curobo.geom` to compute costs given a trajectory of + actions. +- :mod:`curobo.util` contains utility methods. +- :mod:`curobo.wrap` adds the user-level API for task programming. It includes implementations of + collision-free reaching and batched robot world collision checking. +- :mod:`curobo.types` contains custom dataclasses for common data types in robotics, including + :class:`types.state.JointState`, :class:`types.camera.CameraObservation`, :class:`types.math.Pose`. +""" # NOTE (roflaherty): This is inspired by how matplotlib creates its version value. diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf index 5fc618d..1821c7a 100644 --- a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf +++ b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf @@ -172,7 +172,7 @@ [XML hunk content lost in extraction] diff --git a/src/curobo/content/configs/robot/franka.yml b/src/curobo/content/configs/robot/franka.yml index 07ae276..bc3f013 100644 --- a/src/curobo/content/configs/robot/franka.yml +++ b/src/curobo/content/configs/robot/franka.yml @@ -13,7 +13,7 @@ robot_cfg: kinematics: use_usd_kinematics: False isaac_usd_path: "/Isaac/Robots/Franka/franka.usd" - usd_path: "robot/non_shipping/franka/franka_panda_meters.usda" + usd_path: "robot/franka/franka_panda.usda" usd_robot_root: "/panda" usd_flip_joints: ["panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5", "panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"] @@ -32,7 +32,7 @@ robot_cfg: usd_flip_joint_limits: ["panda_finger_joint2"] urdf_path: "robot/franka_description/franka_panda.urdf" asset_root_path: "robot/franka_description" - base_link: "base_link" + base_link: "panda_link0" ee_link: "panda_hand" collision_link_names: [ @@ -50,7 +50,7 @@ robot_cfg: "attached_object", ] collision_spheres: "spheres/franka_mesh.yml" - collision_sphere_buffer: 0.0025 + collision_sphere_buffer: 0.004 # 0.0025 extra_collision_spheres: {"attached_object": 4} use_global_cumul: True self_collision_ignore: diff --git a/src/curobo/content/configs/robot/franka_mobile.yml b/src/curobo/content/configs/robot/franka_mobile.yml index be13b24..62a4f52 100644 --- a/src/curobo/content/configs/robot/franka_mobile.yml +++ b/src/curobo/content/configs/robot/franka_mobile.yml @@ -109,7 +109,7 @@ robot_cfg: "base_x", "base_y", "base_z", "panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5", "panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"] - retract_config: [0,0,0,0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04] + retract_config: [0,0,0,0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04] null_space_weight: [1,1,1,1,1,1,1,1,1,1,1,1] cspace_distance_weight: [1,1,1,1,1,1,1,1,1,1,1,1] max_acceleration: 15.0 #15.0
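The franka.yml changes above move the kinematic root from `base_link` to `panda_link0` and increase `collision_sphere_buffer`. A minimal sketch for sanity-checking such robot-config edits, following the loading pattern used by the examples in this changeset (assumes a CUDA-capable install; the `retract_config` property is the one added to CudaRobotModel in this release):

# CuRobo
from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel
from curobo.types.base import TensorDeviceType
from curobo.types.robot import RobotConfig
from curobo.util_file import get_robot_configs_path, join_path, load_yaml

tensor_args = TensorDeviceType()
robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"]
robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)
kin_model = CudaRobotModel(robot_cfg.kinematics)

# Forward kinematics at the retract configuration; the panda_hand pose is now
# reported in the panda_link0 frame.
state = kin_model.get_state(kin_model.retract_config.view(1, -1))
print(state.ee_position, state.ee_quaternion)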
diff --git a/src/curobo/content/configs/robot/tm12.yml b/src/curobo/content/configs/robot/tm12.yml index 01db09c..4d0cf78 100644 --- a/src/curobo/content/configs/robot/tm12.yml +++ b/src/curobo/content/configs/robot/tm12.yml @@ -12,7 +12,7 @@ robot_cfg: kinematics: use_usd_kinematics: False - usd_path: "robot/non_shipping/techman/tm12.usd" + usd_path: "robot/techman/tm12.usd" usd_robot_root: "/tm12" usd_flip_joints: { diff --git a/src/curobo/content/configs/robot/ur5e.yml b/src/curobo/content/configs/robot/ur5e.yml index e2b2be3..689bd9a 100644 --- a/src/curobo/content/configs/robot/ur5e.yml +++ b/src/curobo/content/configs/robot/ur5e.yml @@ -49,7 +49,7 @@ robot_cfg: - "center": [-0.055, 0.008, 0.14] "radius": 0.07 - "center": [-0.001, -0.002, 0.143] - "radius": 0.076 + "radius": 0.08 forearm_link: - "center": [-0.01, 0.002, 0.031] "radius": 0.072 diff --git a/src/curobo/content/configs/task/finetune_trajopt.yml b/src/curobo/content/configs/task/finetune_trajopt.yml index 3d13330..467d86b 100644 --- a/src/curobo/content/configs/task/finetune_trajopt.yml +++ b/src/curobo/content/configs/task/finetune_trajopt.yml @@ -31,22 +31,23 @@ model: cost: pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps - run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position - weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40] + run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position + weight: [2000,500000.0,30,50] vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation terminal: True - run_weight: 0.0 #0.05 + run_weight: 1.0 use_metric: True link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps - run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40] vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation terminal: True - run_weight: 0.0 + run_weight: 0.001 use_metric: True + cspace_cfg: weight: 10000.0 terminal: True @@ -54,7 +55,7 @@ cost: bound_cfg: weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values - smooth_weight: [0.0,2000.0, 50.0, 0.0] # [vel, acc, jerk,] + smooth_weight: [0.0,5000.0, 50.0, 0.0] # [vel, acc, jerk,] run_weight_velocity: 0.0 run_weight_acceleration: 1.0 run_weight_jerk: 1.0 @@ -78,11 +79,11 @@ cost: lbfgs: - n_iters: 400 + n_iters: 400 # 400 inner_iters: 25 - cold_start_n_iters: 200 + cold_start_n_iters: null min_iters: 25 - line_search_scale: [0.01, 0.3,0.7,1.0] # # + line_search_scale: [0.01, 0.3, 0.7,1.0] # # fixed_iters: True cost_convergence: 0.01 cost_delta_threshold: 1.0 @@ -90,7 +91,7 @@ lbfgs: epsilon: 0.01 history: 15 #15 use_cuda_graph: True - n_envs: 1 + n_problems: 1 store_debug: False use_cuda_kernel: True stable_mode: True diff --git a/src/curobo/content/configs/task/finetune_trajopt_slow.yml b/src/curobo/content/configs/task/finetune_trajopt_slow.yml new file mode 100644 index 0000000..35f1a35 --- /dev/null +++ b/src/curobo/content/configs/task/finetune_trajopt_slow.yml @@ -0,0 +1,109 @@ + +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +model: + horizon: 32 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 1.0 + enable: False + dt_traj_params: + base_dt: 0.2 + base_ratio: 1.0 + max_dt: 0.2 + vel_scale: 1.0 + control_space: 'POSITION' + state_finite_difference_mode: "CENTRAL" + + teleport_mode: False + return_full_act_buffer: True +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps + run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation + terminal: True + run_weight: 0.0 #0.05 + use_metric: True + + link_pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps + run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation + terminal: True + run_weight: 0.001 + use_metric: True + + + cspace_cfg: + weight: 10000.0 + terminal: True + run_weight: 0.00 #1 + + bound_cfg: + weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values + + smooth_weight: [0.0,50000.0, 500.0, 0.0] # [vel, acc, jerk,] + + run_weight_velocity: 0.0 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk + null_space_weight: [0.0] + + primitive_collision_cfg: + weight: 1000000.0 + use_sweep: True + sweep_steps: 6 + classify: False + use_sweep_kernel: True + use_speed_metric: True + speed_dt: 0.01 # used only for speed metric + activation_distance: 0.025 + + self_collision_cfg: + weight: 5000.0 + classify: False + + + +lbfgs: + n_iters: 400 # 400 + inner_iters: 25 + cold_start_n_iters: 200 + min_iters: 25 + line_search_scale: [0.01, 0.3, 0.7,1.0] # # + fixed_iters: True + cost_convergence: 0.01 + cost_delta_threshold: 1.0 + cost_relative_threshold: 0.999 #0.999 + epsilon: 0.01 + history: 15 #15 + use_cuda_graph: True + n_problems: 1 + store_debug: False + use_cuda_kernel: True + stable_mode: True + line_search_type: "approx_wolfe" + use_cuda_line_search_kernel: True + use_cuda_update_best_kernel: True + use_temporal_smooth: False + sync_cuda_time: True + step_scale: 1.0 + last_best: 10 + use_coo_sparse: True + debug_info: + visual_traj : null #'ee_pos_seq' diff --git a/src/curobo/content/configs/task/gradient_ik.yml b/src/curobo/content/configs/task/gradient_ik.yml index 73eaf05..97e17b6 100644 --- a/src/curobo/content/configs/task/gradient_ik.yml +++ b/src/curobo/content/configs/task/gradient_ik.yml @@ -32,6 +32,7 @@ cost: vec_convergence: [0.0, 0.00] terminal: False use_metric: True + run_weight: 1.0 link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] weight: [1000, 20000, 30, 50] @@ -58,7 +59,7 @@ cost: lbfgs: n_iters: 100 #60 inner_iters: 25 - cold_start_n_iters: 100 + cold_start_n_iters: null min_iters: 20 line_search_scale: [0.01, 0.3, 
0.7, 1.0] fixed_iters: True @@ -69,7 +70,7 @@ lbfgs: history: 6 horizon: 1 use_cuda_graph: True - n_envs: 1 + n_problems: 1 store_debug: False use_cuda_kernel: True stable_mode: True diff --git a/src/curobo/content/configs/task/gradient_mpc.yml b/src/curobo/content/configs/task/gradient_mpc.yml index 66b2093..4de461a 100644 --- a/src/curobo/content/configs/task/gradient_mpc.yml +++ b/src/curobo/content/configs/task/gradient_mpc.yml @@ -83,7 +83,7 @@ lbfgs: epsilon: 0.01 history: 15 #15 use_cuda_graph: True - n_envs: 1 + n_problems: 1 store_debug: False use_cuda_kernel: True stable_mode: True diff --git a/src/curobo/content/configs/task/gradient_trajopt.yml b/src/curobo/content/configs/task/gradient_trajopt.yml index a786659..471d186 100644 --- a/src/curobo/content/configs/task/gradient_trajopt.yml +++ b/src/curobo/content/configs/task/gradient_trajopt.yml @@ -31,11 +31,11 @@ model: cost: pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps - run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position weight: [2000,50000.0,30,60] #[150.0, 2000.0, 30, 40] vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation terminal: True - run_weight: 0.00 + run_weight: 1.0 use_metric: True link_pose_cfg: @@ -44,7 +44,7 @@ cost: weight: [2000,50000.0,30,50] #[150.0, 2000.0, 30, 40] vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation terminal: True - run_weight: 0.0 + run_weight: 0.001 use_metric: True cspace_cfg: @@ -54,7 +54,9 @@ cost: bound_cfg: weight: [5000.0, 50000.0, 50000.0,50000.0] # needs to be 3 values + #weight: [000.0, 000.0, 000.0,000.0] smooth_weight: [0.0,10000.0,10.0, 0.0] # [vel, acc, jerk, alpha_v-not-used] + #smooth_weight: [0.0,0000.0,0.0, 0.0] # [vel, acc, jerk, alpha_v-not-used] run_weight_velocity: 0.00 run_weight_acceleration: 1.0 run_weight_jerk: 1.0 @@ -79,9 +81,9 @@ cost: lbfgs: - n_iters: 175 #175 + n_iters: 125 #175 inner_iters: 25 - cold_start_n_iters: 150 + cold_start_n_iters: null min_iters: 25 line_search_scale: [0.01,0.3,0.7,1.0] #[0.01,0.2, 0.3,0.5,0.7,0.9, 1.0] # fixed_iters: True @@ -91,7 +93,7 @@ lbfgs: epsilon: 0.01 history: 15 use_cuda_graph: True - n_envs: 1 + n_problems: 1 store_debug: False use_cuda_kernel: True stable_mode: True diff --git a/src/curobo/content/configs/task/particle_ik.yml b/src/curobo/content/configs/task/particle_ik.yml index 67b7ba7..903e549 100644 --- a/src/curobo/content/configs/task/particle_ik.yml +++ b/src/curobo/content/configs/task/particle_ik.yml @@ -31,6 +31,7 @@ cost: vec_convergence: [0.00, 0.000] # orientation, position terminal: False use_metric: True + run_weight: 1.0 link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] weight: [30, 50, 10, 10] #[20.0, 100.0] @@ -56,7 +57,7 @@ mppi: init_cov : 1.0 #0.15 #.5 #.5 gamma : 1.0 n_iters : 4 - cold_start_n_iters: 4 + cold_start_n_iters: null step_size_mean : 0.9 step_size_cov : 0.1 beta : 0.01 @@ -69,12 +70,12 @@ mppi: sample_mode : 'BEST' base_action : 'REPEAT' squash_fn : 'CLAMP' - n_envs : 1 + n_problems : 1 use_cuda_graph : True seed : 0 store_debug : False random_mean : False - sample_per_env : True + sample_per_problem: True sync_cuda_time : False use_coo_sparse : True sample_params: diff --git a/src/curobo/content/configs/task/particle_mpc.yml b/src/curobo/content/configs/task/particle_mpc.yml index 
eec1adf..2b5b7e8 100644 --- a/src/curobo/content/configs/task/particle_mpc.yml +++ b/src/curobo/content/configs/task/particle_mpc.yml @@ -89,12 +89,12 @@ mppi: sample_mode : 'BEST' base_action : 'REPEAT' squash_fn : 'CLAMP' - n_envs : 1 + n_problems : 1 use_cuda_graph : True seed : 0 store_debug : False random_mean : True - sample_per_env : False + sample_per_problem: False sync_cuda_time : True use_coo_sparse : True sample_params: diff --git a/src/curobo/content/configs/task/particle_trajopt.yml b/src/curobo/content/configs/task/particle_trajopt.yml index 4183c6b..1c76623 100644 --- a/src/curobo/content/configs/task/particle_trajopt.yml +++ b/src/curobo/content/configs/task/particle_trajopt.yml @@ -30,17 +30,17 @@ model: cost: pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight + run_vec_weight: [0.0,0.0,0.0,0.0,0.0,0.0] # running weight weight: [250.0, 5000.0, 20, 20] vec_convergence: [0.0,0.0,1000.0,1000.0] terminal: True - run_weight: 0.00 + run_weight: 1.0 use_metric: True link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight - weight: [250.0, 5000.0, 40, 40] + run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position + weight: [0.0, 5000.0, 40, 40] vec_convergence: [0.0,0.0,1000.0,1000.0] terminal: True run_weight: 0.00 @@ -54,10 +54,12 @@ cost: bound_cfg: weight: [0.1, 0.1,0.0,0.0] activation_distance: [0.0,0.0,0.0,0.0] #-#0.01 + #smooth_weight: [0.0,100.0,1.0,0.0] smooth_weight: [0.0,20.0,0.0,0.0] run_weight_velocity: 0.0 run_weight_acceleration: 1.0 run_weight_jerk: 1.0 + null_space_weight: [0.00] primitive_collision_cfg: weight: 5000.0 @@ -80,7 +82,7 @@ mppi: init_cov : 0.5 gamma : 1.0 n_iters : 2 - cold_start_n_iters: 2 + cold_start_n_iters: null step_size_mean : 0.9 step_size_cov : 0.01 beta : 0.01 @@ -93,12 +95,12 @@ mppi: sample_mode : 'BEST' base_action : 'REPEAT' squash_fn : 'CLAMP' - n_envs : 1 + n_problems : 1 use_cuda_graph : True seed : 0 store_debug : False random_mean : True - sample_per_env : True + sample_per_problem: True sync_cuda_time : False use_coo_sparse : True sample_params: diff --git a/src/curobo/content/configs/world/collision_table.yml b/src/curobo/content/configs/world/collision_table.yml index 169a95f..526b550 100644 --- a/src/curobo/content/configs/world/collision_table.yml +++ b/src/curobo/content/configs/world/collision_table.yml @@ -11,4 +11,4 @@ cuboid: table: dims: [5.0, 5.0, 0.2] # x, y, z - pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw \ No newline at end of file diff --git a/src/curobo/cuda_robot_model/cuda_robot_generator.py b/src/curobo/cuda_robot_model/cuda_robot_generator.py index d105348..60dbea8 100644 --- a/src/curobo/cuda_robot_model/cuda_robot_generator.py +++ b/src/curobo/cuda_robot_model/cuda_robot_generator.py @@ -145,6 +145,9 @@ class CudaRobotGeneratorConfig: #: cspace config cspace: Union[None, CSpaceConfig, Dict[str, List[Any]]] = None + #: Enable loading meshes from kinematics parser. 
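+    #: Meshes are used for visualization and debugging (e.g., rendering the robot in
+    #: the image segmentation example); collision checking uses the sphere model.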
+ load_meshes: bool = False + def __post_init__(self): # add root path: # Check if an external asset path is provided: @@ -283,7 +286,10 @@ class CudaRobotGenerator(CudaRobotGeneratorConfig): ) else: self._kinematics_parser = UrdfKinematicsParser( - self.urdf_path, mesh_root=self.asset_root_path, extra_links=self.extra_links + self.urdf_path, + mesh_root=self.asset_root_path, + extra_links=self.extra_links, + load_meshes=self.load_meshes, ) if self.lock_joints is None: diff --git a/src/curobo/cuda_robot_model/cuda_robot_model.py b/src/curobo/cuda_robot_model/cuda_robot_model.py index 78f4370..6f5a45a 100644 --- a/src/curobo/cuda_robot_model/cuda_robot_model.py +++ b/src/curobo/cuda_robot_model/cuda_robot_model.py @@ -30,7 +30,7 @@ from curobo.cuda_robot_model.types import ( SelfCollisionKinematicsConfig, ) from curobo.curobolib.kinematics import get_cuda_kinematics -from curobo.geom.types import Sphere +from curobo.geom.types import Mesh, Sphere from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.util.logger import log_error @@ -390,4 +390,8 @@ class CudaRobotModel(CudaRobotModelConfig): return self.kinematics_config.base_link def update_kinematics_config(self, new_kin_config: KinematicsTensorConfig): - self.kinematics_config.copy(new_kin_config) + self.kinematics_config.copy_(new_kin_config) + + @property + def retract_config(self): + return self.kinematics_config.cspace.retract_config diff --git a/src/curobo/cuda_robot_model/types.py b/src/curobo/cuda_robot_model/types.py index a620457..9c782ad 100644 --- a/src/curobo/cuda_robot_model/types.py +++ b/src/curobo/cuda_robot_model/types.py @@ -23,7 +23,7 @@ from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.types.state import JointState from curobo.types.tensor import T_DOF -from curobo.util.tensor_util import copy_if_not_none +from curobo.util.tensor_util import clone_if_not_none, copy_if_not_none class JointType(Enum): @@ -75,20 +75,28 @@ class JointLimits: self.tensor_args, ) + def copy_(self, new_jl: JointLimits): + self.joint_names = new_jl.joint_names.copy() + self.position.copy_(new_jl.position) + self.velocity.copy_(new_jl.velocity) + self.acceleration.copy_(new_jl.acceleration) + self.effort = copy_if_not_none(new_jl.effort, self.effort) + return self + @dataclass class CSpaceConfig: joint_names: List[str] retract_config: Optional[T_DOF] = None cspace_distance_weight: Optional[T_DOF] = None - null_space_weight: Optional[T_DOF] = None # List[float] + null_space_weight: Optional[T_DOF] = None tensor_args: TensorDeviceType = TensorDeviceType() max_acceleration: Union[float, List[float]] = 10.0 max_jerk: Union[float, List[float]] = 500.0 velocity_scale: Union[float, List[float]] = 1.0 acceleration_scale: Union[float, List[float]] = 1.0 jerk_scale: Union[float, List[float]] = 1.0 - position_limit_clip: Union[float, List[float]] = 0.01 + position_limit_clip: Union[float, List[float]] = 0.0 def __post_init__(self): if self.retract_config is not None: @@ -147,12 +155,31 @@ class CSpaceConfig: joint_names = [self.joint_names[n] for n in new_index] self.joint_names = joint_names + def copy_(self, new_config: CSpaceConfig): + self.joint_names = new_config.joint_names.copy() + self.retract_config = copy_if_not_none(new_config.retract_config, self.retract_config) + self.null_space_weight = copy_if_not_none( + new_config.null_space_weight, self.null_space_weight + ) + self.cspace_distance_weight = copy_if_not_none( + new_config.cspace_distance_weight, 
self.cspace_distance_weight + ) + self.tensor_args = self.tensor_args + self.max_jerk = copy_if_not_none(new_config.max_jerk, self.max_jerk) + self.max_acceleration = copy_if_not_none(new_config.max_acceleration, self.max_acceleration) + self.velocity_scale = copy_if_not_none(new_config.velocity_scale, self.velocity_scale) + self.acceleration_scale = copy_if_not_none( + new_config.acceleration_scale, self.acceleration_scale + ) + self.jerk_scale = copy_if_not_none(new_config.jerk_scale, self.jerk_scale) + return self + def clone(self) -> CSpaceConfig: return CSpaceConfig( joint_names=self.joint_names.copy(), - retract_config=copy_if_not_none(self.retract_config), - null_space_weight=copy_if_not_none(self.null_space_weight), - cspace_distance_weight=copy_if_not_none(self.cspace_distance_weight), + retract_config=clone_if_not_none(self.retract_config), + null_space_weight=clone_if_not_none(self.null_space_weight), + cspace_distance_weight=clone_if_not_none(self.cspace_distance_weight), tensor_args=self.tensor_args, max_jerk=self.max_jerk.clone(), max_acceleration=self.max_acceleration.clone(), @@ -215,12 +242,48 @@ class KinematicsTensorConfig: cspace: Optional[CSpaceConfig] = None base_link: str = "base_link" ee_link: str = "ee_link" + reference_link_spheres: Optional[torch.Tensor] = None def __post_init__(self): if self.cspace is None and self.joint_limits is not None: self.load_cspace_cfg_from_kinematics() if self.joint_limits is not None and self.cspace is not None: self.joint_limits = self.cspace.scale_joint_limits(self.joint_limits) + if self.link_spheres is not None and self.reference_link_spheres is None: + self.reference_link_spheres = self.link_spheres.clone() + + def copy_(self, new_config: KinematicsTensorConfig): + self.fixed_transforms.copy_(new_config.fixed_transforms) + self.link_map.copy_(new_config.link_map) + self.joint_map.copy_(new_config.joint_map) + self.joint_map_type.copy_(new_config.joint_map_type) + self.store_link_map.copy_(new_config.store_link_map) + self.link_chain_map.copy_(new_config.link_chain_map) + self.joint_limits.copy_(new_config.joint_limits) + if new_config.link_spheres is not None and self.link_spheres is not None: + self.link_spheres.copy_(new_config.link_spheres) + if new_config.link_sphere_idx_map is not None and self.link_sphere_idx_map is not None: + self.link_sphere_idx_map.copy_(new_config.link_sphere_idx_map) + if new_config.link_name_to_idx_map is not None and self.link_name_to_idx_map is not None: + self.link_name_to_idx_map = new_config.link_name_to_idx_map.copy() + if ( + new_config.reference_link_spheres is not None + and self.reference_link_spheres is not None + ): + self.reference_link_spheres.copy_(new_config.reference_link_spheres) + self.base_link = new_config.base_link + self.ee_idx = new_config.ee_idx + self.ee_link = new_config.ee_link + self.debug = new_config.debug + self.cspace.copy_(new_config.cspace) + self.n_dof = new_config.n_dof + self.non_fixed_joint_names = new_config.non_fixed_joint_names + self.joint_names = new_config.joint_names + self.link_names = new_config.link_names + self.mesh_link_names = new_config.mesh_link_names + self.total_spheres = new_config.total_spheres + + return self def load_cspace_cfg_from_kinematics(self): retract_config = ( @@ -270,6 +333,13 @@ class KinematicsTensorConfig: link_sphere_index = self.get_sphere_index_from_link_name(link_name) return self.link_spheres[link_sphere_index, :] + def get_reference_link_spheres( + self, + link_name: str, + ): + link_sphere_index = 
self.get_sphere_index_from_link_name(link_name) + return self.reference_link_spheres[link_sphere_index, :] + def attach_object( self, sphere_radius: Optional[float] = None, @@ -332,6 +402,19 @@ class KinematicsTensorConfig: """ return self.get_link_spheres(link_name).shape[0] + def disable_link_spheres(self, link_name: str): + if link_name not in self.link_name_to_idx_map.keys(): + raise ValueError(link_name + " not found in spheres") + curr_spheres = self.get_link_spheres(link_name) + curr_spheres[:, 3] = -100.0 + self.update_link_spheres(link_name, curr_spheres) + + def enable_link_spheres(self, link_name: str): + if link_name not in self.link_name_to_idx_map.keys(): + raise ValueError(link_name + " not found in spheres") + curr_spheres = self.get_reference_link_spheres(link_name) + self.update_link_spheres(link_name, curr_spheres) + @dataclass class SelfCollisionKinematicsConfig: diff --git a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py index aa5b6e3..e116f68 100644 --- a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py +++ b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py @@ -183,6 +183,7 @@ class UrdfKinematicsParser(KinematicsParser): else: # convert to list: mesh_pose = Pose.from_matrix(mesh_pose).to_list() + return CuroboMesh( name=link_name, pose=mesh_pose, diff --git a/src/curobo/cuda_robot_model/usd_kinematics_parser.py b/src/curobo/cuda_robot_model/usd_kinematics_parser.py index d9dc978..2715b0c 100644 --- a/src/curobo/cuda_robot_model/usd_kinematics_parser.py +++ b/src/curobo/cuda_robot_model/usd_kinematics_parser.py @@ -13,7 +13,6 @@ from typing import Dict, List, Optional, Tuple # Third Party import numpy as np -from pxr import Usd, UsdPhysics # CuRobo from curobo.cuda_robot_model.kinematics_parser import KinematicsParser, LinkParams @@ -22,6 +21,15 @@ from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.util.logger import log_error +try: + # Third Party + from pxr import Usd, UsdPhysics +except ImportError: + raise ImportError( + "usd-core failed to import, install with pip install usd-core" + + " NOTE: Do not install this if using with ISAAC SIM." + ) + class UsdKinematicsParser(KinematicsParser): """An experimental kinematics parser from USD. diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp index cd69a6f..3775edb 100644 --- a/src/curobo/curobolib/cpp/geom_cuda.cpp +++ b/src/curobo/curobolib/cpp/geom_cuda.cpp @@ -16,43 +16,55 @@ // CUDA forward declarations -std::vector self_collision_distance( - torch::Tensor out_distance, torch::Tensor out_vec, - torch::Tensor sparse_index, - const torch::Tensor robot_spheres, // batch_size x n_spheres x 4 - const torch::Tensor collision_offset, // n_spheres x n_spheres - const torch::Tensor weight, - const torch::Tensor collision_matrix, // n_spheres x n_spheres - const torch::Tensor thread_locations, const int locations_size, - const int batch_size, const int nspheres, const bool compute_grad = false, - const int ndpt = 8, // Does this need to match template? 
- const bool debug = false); +std::vectorself_collision_distance( + torch::Tensor out_distance, + torch::Tensor out_vec, + torch::Tensor sparse_index, + const torch::Tensor robot_spheres, // batch_size x n_spheres x 4 + const torch::Tensor collision_offset, // n_spheres x n_spheres + const torch::Tensor weight, + const torch::Tensor collision_matrix, // n_spheres x n_spheres + const torch::Tensor thread_locations, + const int locations_size, + const int batch_size, + const int nspheres, + const bool compute_grad = false, + const int ndpt = 8, // Does this need to match template? + const bool debug = false); // CUDA forward declarations -std::vector swept_sphere_obb_clpt( - const torch::Tensor sphere_position, // batch_size, 3 - torch::Tensor distance, // batch_size, 1 - torch::Tensor - closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient - torch::Tensor sparsity_idx, const torch::Tensor weight, - const torch::Tensor activation_distance, const torch::Tensor speed_dt, - const torch::Tensor obb_accel, // n_boxes, 4, 4 - const torch::Tensor obb_bounds, // n_boxes, 3 - const torch::Tensor obb_pose, // n_boxes, 4, 4 - const torch::Tensor obb_enable, // n_boxes, 4, - const torch::Tensor n_env_obb, // n_boxes, 4, 4 - const torch::Tensor env_query_idx, // n_boxes, 4, 4 - const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const int sweep_steps, const bool enable_speed_metric, - const bool transform_back, const bool compute_distance, - const bool use_batch_env); +std::vectorswept_sphere_obb_clpt( + const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, // batch_size, 1 + torch::Tensor + closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient + torch::Tensor sparsity_idx, + const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor speed_dt, + const torch::Tensor obb_accel, // n_boxes, 4, 4 + const torch::Tensor obb_bounds, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, + const torch::Tensor n_env_obb, // n_boxes, 4, 4 + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, + const int batch_size, + const int horizon, + const int n_spheres, + const int sweep_steps, + const bool enable_speed_metric, + const bool transform_back, + const bool compute_distance, + const bool use_batch_env); std::vector sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 - torch::Tensor distance, - torch::Tensor closest_point, // batch size, 3 - torch::Tensor sparsity_idx, const torch::Tensor weight, + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, + const torch::Tensor weight, const torch::Tensor activation_distance, const torch::Tensor obb_accel, // n_boxes, 4, 4 const torch::Tensor obb_bounds, // n_boxes, 3 @@ -60,58 +72,75 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 const torch::Tensor obb_enable, // n_boxes, 4, 4 const torch::Tensor n_env_obb, // n_boxes, 4, 4 const torch::Tensor env_query_idx, // n_boxes, 4, 4 - const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const bool transform_back, - const bool compute_distance, const bool use_batch_env); + const int max_nobs, + const int batch_size, + const int horizon, + const int n_spheres, + const bool transform_back, + const bool compute_distance, + const bool use_batch_env); -std::vector pose_distance( - torch::Tensor out_distance, 
torch::Tensor out_position_distance, - torch::Tensor out_rotation_distance, - torch::Tensor distance_p_vector, // batch size, 3 - torch::Tensor distance_q_vector, // batch size, 4 - torch::Tensor out_gidx, - const torch::Tensor current_position, // batch_size, 3 - const torch::Tensor goal_position, // n_boxes, 3 - const torch::Tensor current_quat, const torch::Tensor goal_quat, - const torch::Tensor vec_weight, // n_boxes, 4, 4 - const torch::Tensor weight, // n_boxes, 4, 4 - const torch::Tensor vec_convergence, const torch::Tensor run_weight, - const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx, - const int batch_size, const int horizon, const int mode, - const int num_goals = 1, const bool compute_grad = false, - const bool write_distance = true, const bool use_metric = false); +std::vectorpose_distance( + torch::Tensor out_distance, + torch::Tensor out_position_distance, + torch::Tensor out_rotation_distance, + torch::Tensor distance_p_vector, // batch size, 3 + torch::Tensor distance_q_vector, // batch size, 4 + torch::Tensor out_gidx, + const torch::Tensor current_position, // batch_size, 3 + const torch::Tensor goal_position, // n_boxes, 3 + const torch::Tensor current_quat, + const torch::Tensor goal_quat, + const torch::Tensor vec_weight, // n_boxes, 4, 4 + const torch::Tensor weight, // n_boxes, 4, 4 + const torch::Tensor vec_convergence, + const torch::Tensor run_weight, + const torch::Tensor run_vec_weight, + const torch::Tensor offset_waypoint, + const torch::Tensor offset_tstep_fraction, + const torch::Tensor batch_pose_idx, + const int batch_size, + const int horizon, + const int mode, + const int num_goals = 1, + const bool compute_grad = false, + const bool write_distance = true, + const bool use_metric = false, + const bool project_distance = true); std::vector -backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q, +backward_pose_distance(torch::Tensor out_grad_p, + torch::Tensor out_grad_q, const torch::Tensor grad_distance, // batch_size, 3 const torch::Tensor grad_p_distance, // n_boxes, 3 const torch::Tensor grad_q_distance, const torch::Tensor pose_weight, - const torch::Tensor grad_p_vec, // n_boxes, 4, 4 - const torch::Tensor grad_q_vec, const int batch_size, - const bool use_distance = false); + const torch::Tensor grad_p_vec, // n_boxes, 4, 4 + const torch::Tensor grad_q_vec, + const int batch_size, + const bool use_distance = false); // C++ interface // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), # x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -std::vector self_collision_distance_wrapper( - torch::Tensor out_distance, torch::Tensor out_vec, - torch::Tensor sparse_index, - const torch::Tensor robot_spheres, // batch_size x n_spheres x 4 - const torch::Tensor collision_offset, // n_spheres - const torch::Tensor weight, - const torch::Tensor collision_matrix, // n_spheres - const torch::Tensor thread_locations, const int thread_locations_size, - const int batch_size, const int nspheres, const bool compute_grad = false, - const int ndpt = 8, const bool debug = false) { - +std::vectorself_collision_distance_wrapper( + torch::Tensor out_distance, torch::Tensor out_vec, + torch::Tensor sparse_index, + const torch::Tensor robot_spheres, // batch_size x n_spheres x 4 + const torch::Tensor collision_offset, // n_spheres + const torch::Tensor weight, + const torch::Tensor collision_matrix, // n_spheres + const torch::Tensor thread_locations, const int thread_locations_size, + const int batch_size, const int nspheres, const bool compute_grad = false, + const int ndpt = 8, const bool debug = false) +{ CHECK_INPUT(out_distance); CHECK_INPUT(out_vec); CHECK_INPUT(robot_spheres); @@ -123,29 +152,30 @@ std::vector self_collision_distance_wrapper( const at::cuda::OptionalCUDAGuard guard(robot_spheres.device()); return self_collision_distance( - out_distance, out_vec, sparse_index, robot_spheres, - collision_offset, weight, collision_matrix, thread_locations, - thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug); + out_distance, out_vec, sparse_index, robot_spheres, + collision_offset, weight, collision_matrix, thread_locations, + thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug); } -std::vector sphere_obb_clpt_wrapper( - const torch::Tensor sphere_position, // batch_size, 4 - - torch::Tensor distance, - torch::Tensor closest_point, // batch size, 3 - torch::Tensor sparsity_idx, const torch::Tensor weight, - const torch::Tensor activation_distance, - const torch::Tensor obb_accel, // n_boxes, 4, 4 - const torch::Tensor obb_bounds, // n_boxes, 3 - const torch::Tensor obb_pose, // n_boxes, 4, 4 - const torch::Tensor obb_enable, // n_boxes, 4, 4 - const torch::Tensor n_env_obb, // n_boxes, 4, 4 - const torch::Tensor env_query_idx, // n_boxes, 4, 4 - const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const bool transform_back, const bool compute_distance, - const bool use_batch_env) { +std::vectorsphere_obb_clpt_wrapper( + const torch::Tensor sphere_position, // batch_size, 4 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor obb_accel, // n_boxes, 4, 4 + const torch::Tensor obb_bounds, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_obb, // n_boxes, 4, 4 + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, const int batch_size, const int horizon, + const int n_spheres, const bool 
transform_back, const bool compute_distance, + const bool use_batch_env) +{ const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); + CHECK_INPUT(distance); CHECK_INPUT(closest_point); CHECK_INPUT(sphere_position); @@ -154,55 +184,61 @@ std::vector sphere_obb_clpt_wrapper( CHECK_INPUT(activation_distance); CHECK_INPUT(obb_accel); return sphere_obb_clpt( - sphere_position, distance, closest_point, sparsity_idx, weight, - activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, - n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres, - transform_back, compute_distance, use_batch_env); + sphere_position, distance, closest_point, sparsity_idx, weight, + activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres, + transform_back, compute_distance, use_batch_env); } -std::vector swept_sphere_obb_clpt_wrapper( - const torch::Tensor sphere_position, // batch_size, 4 - torch::Tensor distance, - torch::Tensor closest_point, // batch size, 3 - torch::Tensor sparsity_idx, const torch::Tensor weight, - const torch::Tensor activation_distance, const torch::Tensor speed_dt, - const torch::Tensor obb_accel, // n_boxes, 4, 4 - const torch::Tensor obb_bounds, // n_boxes, 3 - const torch::Tensor obb_pose, // n_boxes, 4, 4 - const torch::Tensor obb_enable, // n_boxes, 4, 4 - const torch::Tensor n_env_obb, // n_boxes, 4, 4 - const torch::Tensor env_query_idx, // n_boxes, 4, 4 - const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const int sweep_steps, const bool enable_speed_metric, - const bool transform_back, const bool compute_distance, - const bool use_batch_env) { + +std::vectorswept_sphere_obb_clpt_wrapper( + const torch::Tensor sphere_position, // batch_size, 4 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, const torch::Tensor speed_dt, + const torch::Tensor obb_accel, // n_boxes, 4, 4 + const torch::Tensor obb_bounds, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_obb, // n_boxes, 4, 4 + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, const int batch_size, const int horizon, + const int n_spheres, const int sweep_steps, const bool enable_speed_metric, + const bool transform_back, const bool compute_distance, + const bool use_batch_env) +{ const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); + CHECK_INPUT(distance); CHECK_INPUT(closest_point); CHECK_INPUT(sphere_position); return swept_sphere_obb_clpt( - sphere_position, - distance, closest_point, sparsity_idx, weight, activation_distance, - speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb, - env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps, - enable_speed_metric, transform_back, compute_distance, use_batch_env); + sphere_position, + distance, closest_point, sparsity_idx, weight, activation_distance, + speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb, + env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps, + enable_speed_metric, transform_back, compute_distance, use_batch_env); } -std::vector pose_distance_wrapper( - torch::Tensor out_distance, torch::Tensor out_position_distance, - torch::Tensor out_rotation_distance, - torch::Tensor distance_p_vector, // batch size, 3 - torch::Tensor 
distance_q_vector, // batch size, 4 - torch::Tensor out_gidx, - const torch::Tensor current_position, // batch_size, 3 - const torch::Tensor goal_position, // n_boxes, 3 - const torch::Tensor current_quat, const torch::Tensor goal_quat, - const torch::Tensor vec_weight, // n_boxes, 4, 4 - const torch::Tensor weight, const torch::Tensor vec_convergence, - const torch::Tensor run_weight, const torch::Tensor run_vec_weight, - const torch::Tensor batch_pose_idx, const int batch_size, const int horizon, - const int mode, const int num_goals = 1, const bool compute_grad = false, - const bool write_distance = false, const bool use_metric = false) { +std::vectorpose_distance_wrapper( + torch::Tensor out_distance, torch::Tensor out_position_distance, + torch::Tensor out_rotation_distance, + torch::Tensor distance_p_vector, // batch size, 3 + torch::Tensor distance_q_vector, // batch size, 4 + torch::Tensor out_gidx, + const torch::Tensor current_position, // batch_size, 3 + const torch::Tensor goal_position, // n_boxes, 3 + const torch::Tensor current_quat, const torch::Tensor goal_quat, + const torch::Tensor vec_weight, // n_boxes, 4, 4 + const torch::Tensor weight, const torch::Tensor vec_convergence, + const torch::Tensor run_weight, const torch::Tensor run_vec_weight, + const torch::Tensor offset_waypoint, const torch::Tensor offset_tstep_fraction, + const torch::Tensor batch_pose_idx, const int batch_size, const int horizon, + const int mode, const int num_goals = 1, const bool compute_grad = false, + const bool write_distance = false, const bool use_metric = false, + const bool project_distance = true) +{ // at::cuda::DeviceGuard guard(angle.device()); CHECK_INPUT(out_distance); CHECK_INPUT(out_position_distance); @@ -214,24 +250,30 @@ std::vector pose_distance_wrapper( CHECK_INPUT(current_quat); CHECK_INPUT(goal_quat); CHECK_INPUT(batch_pose_idx); + CHECK_INPUT(offset_waypoint); + CHECK_INPUT(offset_tstep_fraction); const at::cuda::OptionalCUDAGuard guard(current_position.device()); return pose_distance( - out_distance, out_position_distance, out_rotation_distance, - distance_p_vector, distance_q_vector, out_gidx, current_position, - goal_position, current_quat, goal_quat, vec_weight, weight, - vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size, - horizon, mode, num_goals, compute_grad, write_distance, use_metric); + out_distance, out_position_distance, out_rotation_distance, + distance_p_vector, distance_q_vector, out_gidx, current_position, + goal_position, current_quat, goal_quat, vec_weight, weight, + vec_convergence, run_weight, run_vec_weight, + offset_waypoint, + offset_tstep_fraction, + batch_pose_idx, batch_size, + horizon, mode, num_goals, compute_grad, write_distance, use_metric, project_distance); } -std::vector backward_pose_distance_wrapper( - torch::Tensor out_grad_p, torch::Tensor out_grad_q, - const torch::Tensor grad_distance, // batch_size, 3 - const torch::Tensor grad_p_distance, // n_boxes, 3 - const torch::Tensor grad_q_distance, const torch::Tensor pose_weight, - const torch::Tensor grad_p_vec, // n_boxes, 4, 4 - const torch::Tensor grad_q_vec, const int batch_size, - const bool use_distance) { +std::vectorbackward_pose_distance_wrapper( + torch::Tensor out_grad_p, torch::Tensor out_grad_q, + const torch::Tensor grad_distance, // batch_size, 3 + const torch::Tensor grad_p_distance, // n_boxes, 3 + const torch::Tensor grad_q_distance, const torch::Tensor pose_weight, + const torch::Tensor grad_p_vec, // n_boxes, 4, 4 + const torch::Tensor 
grad_q_vec, const int batch_size, + const bool use_distance) +{ CHECK_INPUT(out_grad_p); CHECK_INPUT(out_grad_q); CHECK_INPUT(grad_distance); @@ -241,18 +283,19 @@ std::vector backward_pose_distance_wrapper( const at::cuda::OptionalCUDAGuard guard(grad_distance.device()); return backward_pose_distance( - out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance, - pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance); + out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance, + pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance); } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)"); - m.def("pose_distance_backward", &backward_pose_distance_wrapper, +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)"); + m.def("pose_distance_backward", &backward_pose_distance_wrapper, "Pose Distance Backward (curobolib)"); - m.def("closest_point", &sphere_obb_clpt_wrapper, + m.def("closest_point", &sphere_obb_clpt_wrapper, "Closest Point OBB(curobolib)"); - m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper, + m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper, "Swept Closest Point OBB(curobolib)"); m.def("self_collision_distance", &self_collision_distance_wrapper, diff --git a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp index c73bff9..b7dda16 100644 --- a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp +++ b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp @@ -16,91 +16,112 @@ // CUDA forward declarations std::vector -matrix_to_quaternion(torch::Tensor out_quat, +matrix_to_quaternion(torch::Tensor out_quat, const torch::Tensor in_rot // batch_size, 3 -); + ); -std::vector kin_fused_forward( - torch::Tensor link_pos, torch::Tensor link_quat, - torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat, - const torch::Tensor joint_vec, const torch::Tensor fixed_transform, - const torch::Tensor robot_spheres, const torch::Tensor link_map, - const torch::Tensor joint_map, const torch::Tensor joint_map_type, - const torch::Tensor store_link_map, const torch::Tensor link_sphere_map, - const int batch_size, const int n_spheres, - const bool use_global_cumul = false); +std::vectorkin_fused_forward( + torch::Tensor link_pos, + torch::Tensor link_quat, + torch::Tensor batch_robot_spheres, + torch::Tensor global_cumul_mat, + const torch::Tensor joint_vec, + const torch::Tensor fixed_transform, + const torch::Tensor robot_spheres, + const torch::Tensor link_map, + const torch::Tensor joint_map, + const torch::Tensor joint_map_type, + const torch::Tensor store_link_map, + const torch::Tensor link_sphere_map, + const int batch_size, + const int n_spheres, + const bool use_global_cumul = false); + +std::vectorkin_fused_backward_16t( + torch::Tensor grad_out, + const torch::Tensor grad_nlinks_pos, + const torch::Tensor grad_nlinks_quat, + const torch::Tensor grad_spheres, + const torch::Tensor global_cumul_mat, + const torch::Tensor joint_vec, + const torch::Tensor fixed_transform, + const torch::Tensor robot_spheres, + const torch::Tensor link_map, + const torch::Tensor joint_map, + const torch::Tensor joint_map_type, + const torch::Tensor store_link_map, + const torch::Tensor link_sphere_map, + const torch::Tensor link_chain_map, + const int batch_size, + const int n_spheres, + const bool sparsity_opt = true, + const bool use_global_cumul = 
-std::vector<torch::Tensor> kin_fused_backward_16t(
-    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
-    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
-    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
-    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
-    const torch::Tensor link_map, const torch::Tensor joint_map,
-    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
-    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
-    const bool use_global_cumul = false);

 // C++ interface

 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x); \
   CHECK_CONTIGUOUS(x)

-std::vector<torch::Tensor> kin_forward_wrapper(
-    torch::Tensor link_pos, torch::Tensor link_quat,
-    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
-    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
-    const torch::Tensor robot_spheres, const torch::Tensor link_map,
-    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
-    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
-    const int batch_size, const int n_spheres,
-    const bool use_global_cumul = false) {
-
+std::vector<torch::Tensor>kin_forward_wrapper(
+  torch::Tensor link_pos, torch::Tensor link_quat,
+  torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+  const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+  const torch::Tensor robot_spheres, const torch::Tensor link_map,
+  const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+  const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+  const int batch_size, const int n_spheres,
+  const bool use_global_cumul = false)
+{
   const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

   // TODO: add check input
   return kin_fused_forward(
-      link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
-      fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
-      store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
+    link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
+    fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
+    store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
 }

-std::vector<torch::Tensor> kin_backward_wrapper(
-    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
-    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
-    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
-    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
-    const torch::Tensor link_map, const torch::Tensor joint_map,
-    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
-    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
-    const bool use_global_cumul = false) {
+std::vector<torch::Tensor>kin_backward_wrapper(
+  torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+  const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+  const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+  const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+  const torch::Tensor link_map, const torch::Tensor joint_map,
+  const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+  const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+  const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+  const bool use_global_cumul = false)
+{
   const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

   return kin_fused_backward_16t(
-      grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
-      global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
-      joint_map, joint_map_type, store_link_map, link_sphere_map,
-      link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
+    grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
+    global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
+    joint_map, joint_map_type, store_link_map, link_sphere_map,
+    link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
 }

 std::vector<torch::Tensor>
-matrix_to_quaternion_wrapper(torch::Tensor out_quat,
+matrix_to_quaternion_wrapper(torch::Tensor out_quat,
                              const torch::Tensor in_rot // batch_size, 3
-) {
+                             )
+{
   const at::cuda::OptionalCUDAGuard guard(in_rot.device());

+  CHECK_INPUT(in_rot);
   CHECK_INPUT(out_quat);

   return matrix_to_quaternion(out_quat, in_rot);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
-  m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
+  m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
   m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
         "Rotation Matrix to Quaternion (CUDA)");
 }
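With `CHECK_INPUT(in_rot)` now added, `matrix_to_quaternion_wrapper` validates both of its arguments before dispatching. A hedged usage sketch follows; the `(batch_size, 3, 3)` input layout and the preallocated `(batch_size, 4)` wxyz output are inferred from the `mat_to_quat` docstring and the nine-float reads in the kernel file below (the header comment above says only `batch_size, 3`), so treat the shapes as assumptions:

```cpp
// Sketch: converting a batch of rotation matrices to quaternions.
#include <torch/torch.h>

void rotation_batch_to_quaternions()
{
  const int64_t batch_size = 64;
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);

  // Row-major 3x3 rotation matrices; identity used here as a smoke test.
  auto in_rot   = torch::eye(3, opts).expand({batch_size, 3, 3}).contiguous();
  auto out_quat = torch::zeros({batch_size, 4}, opts); // wxyz, filled in place

  matrix_to_quaternion_wrapper(out_quat, in_rot);
  // normalize_quaternion in the kernel flips the sign so that w >= 0,
  // so every identity rotation comes back as (1, 0, 0, 0).
}
```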
diff --git a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
index 9b2b56c..4b71f57 100644
--- a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
+++ b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
@@ -35,1003 +35,1172 @@
 #define Y_ROT_NEG 10
 #define Z_ROT_NEG 11

-#define MAX_BATCH_PER_BLOCK 32 // tunable parameter for improving occupancy
+#define MAX_BATCH_PER_BLOCK 32    // tunable parameter for improving occupancy
 #define MAX_BW_BATCH_PER_BLOCK 16 // tunable parameter for improving occupancy

-#define MAX_TOTAL_LINKS                                                        \
-  750 // limited by shared memory size. We need to fit 16 * float32 per link
+#define MAX_TOTAL_LINKS \
+  750                             // limited by shared memory size. We need to fit 16 * float32 per
+                                  // link
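The 750 cap on `MAX_TOTAL_LINKS` is a shared-memory budget: the kernels below stage one 4x4 cumulative transform (16 floats) per link in the block-level `cumul_mat` buffer. A back-of-envelope check (the 48 KB figure is the usual static shared-memory limit per block on NVIDIA GPUs, an assumption here rather than a constant from this file):

```cpp
// Sketch: 750 links x 16 floats x 4 B = 48,000 B, just under 48 KB.
constexpr int kFloatsPerLink = 16;  // one 4x4 cumulative transform per link
constexpr int kMaxTotalLinks = 750; // mirrors MAX_TOTAL_LINKS above
constexpr int kBytesNeeded   = kMaxTotalLinks * kFloatsPerLink * sizeof(float);
static_assert(kBytesNeeded <= 48 * 1024, "fits the 48 KB static limit");
```

Note the buffer is shared by every batch item resident in the block (up to `MAX_BATCH_PER_BLOCK` in the forward kernel), so the cap presumably bounds the total matrices staged per block, not the link count of a single robot.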
-namespace Curobo {
-
-namespace Kinematics {
-
-template <typename psum_t>
-__device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
-                                                float3 scale, psum_t &sum_out) {
-  sum_out += scale.x * (a.y * b.z - a.z * b.y) +
-             scale.y * (a.z * b.x - a.x * b.z) +
-             scale.z * (a.x * b.y - a.y * b.x);
-}
-
-__device__ __forceinline__ void normalize_quaternion(float *q) {
-  // get length:
-  float length = 1.0 / norm4df(q[0], q[1], q[2], q[3]);
-  if (q[0] < 0.0) {
-    length = -1.0 * length;
-  }
-
-  q[1] = length * q[1];
-  q[2] = length * q[2];
-  q[3] = length * q[3];
-  q[0] = length * q[0];
-}
-/**
- * @brief get quaternion from transformation matrix
- *
- * @param t transformation matrix 4x4
- * @param q quaternion in wxyz format
- */
-__device__ __forceinline__ void mat_to_quat(float *t, float *q) {
-  float n;
-  float n_sqrt;
-  if (t[10] < 0.0) {
-    if (t[0] > t[5]) {
-      n = 1 + t[0] - t[5] - t[10];
-      n_sqrt = 0.5 * rsqrtf(n);
-      q[1] = n * n_sqrt;
-      q[2] = (t[1] + t[4]) * n_sqrt;
-      q[3] = (t[8] + t[2]) * n_sqrt;
-      q[0] = -1 * (t[6] - t[9]) * n_sqrt; // * -1 ; // this is the wrong one?
-    } else {
-      n = 1 - t[0] + t[5] - t[10];
-      n_sqrt = 0.5 * rsqrtf(n);
-      q[1] = (t[1] + t[4]) * n_sqrt;
-      q[2] = n * n_sqrt;
-      q[3] = (t[6] + t[9]) * n_sqrt;
-      q[0] = -1 * (t[8] - t[2]) * n_sqrt;
+namespace Curobo
+{
+  namespace Kinematics
+  {
+    template<typename psum_t>
+    __device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
+                                                    float3 scale, psum_t& sum_out)
+    {
+      sum_out += scale.x * (a.y * b.z - a.z * b.y) +
+                 scale.y * (a.z * b.x - a.x * b.z) +
+                 scale.z * (a.x * b.y - a.y * b.x);
     }
-  } else {
-    if (t[0] < -1 * t[5]) {
-      n = 1 - t[0] - t[5] + t[10];
-      n_sqrt = 0.5 * rsqrtf(n);
-      q[1] = (t[8] + t[2]) * n_sqrt;
-      q[2] = (t[6] + t[9]) * n_sqrt;
-      q[3] = n * n_sqrt;
-      q[0] = -1 * (t[1] - t[4]) * n_sqrt;
-    } else {
-      n = 1 + t[0] + t[5] + t[10];
-      n_sqrt = 0.5 * rsqrtf(n);
-      q[1] = (t[6] - t[9]) * n_sqrt;
-      q[2] = (t[8] - t[2]) * n_sqrt;
-      q[3] = (t[1] - t[4]) * n_sqrt;
-      q[0] = -1 * n * n_sqrt;
-    }
-  }
-  normalize_quaternion(q);
-}
-/**
- * @brief get quaternion from transformation matrix
- *
- * @param t # rotation matrix 3x3
- * @param q quaternion in wxyz format
- */
-__device__ __forceinline__ void rot_to_quat(float *t, float *q) {
-  float n;
-  float n_sqrt;
-  if (t[8] < 0.0) {
-    if (t[0] > t[4]) {
-      n = 1 + t[0] - t[4] - t[8];
-      n_sqrt = 0.5 * rsqrtf(n);
-      q[1] = n * n_sqrt;
-      q[2] = (t[1] + t[3]) * n_sqrt;
-      q[3] = (t[6] + t[2]) * n_sqrt;
-      q[0] = -1 * (t[5] - t[7]) * n_sqrt; // * -1 ; // this is the wrong one?
- } else { - n = 1 - t[0] + t[4] - t[8]; - n_sqrt = 0.5 * rsqrtf(n); - q[1] = (t[1] + t[3]) * n_sqrt; - q[2] = n * n_sqrt; - q[3] = (t[5] + t[7]) * n_sqrt; - q[0] = -1 * (t[6] - t[2]) * n_sqrt; - } - } else { - if (t[0] < -1 * t[4]) { - n = 1 - t[0] - t[4] + t[8]; - n_sqrt = 0.5 * rsqrtf(n); - q[1] = (t[6] + t[2]) * n_sqrt; - q[2] = (t[5] + t[7]) * n_sqrt; - q[3] = n * n_sqrt; - q[0] = -1 * (t[1] - t[3]) * n_sqrt; - } else { - n = 1 + t[0] + t[4] + t[8]; - n_sqrt = 0.5 * rsqrtf(n); - q[1] = (t[5] - t[7]) * n_sqrt; - q[2] = (t[6] - t[2]) * n_sqrt; - q[3] = (t[1] - t[3]) * n_sqrt; - q[0] = -1 * n * n_sqrt; - } - } - normalize_quaternion(q); -} + __device__ __forceinline__ void normalize_quaternion(float *q) + { + // get length: + float length = 1.0 / norm4df(q[0], q[1], q[2], q[3]); -__device__ __forceinline__ void rot_mul(float *r1, float *r2, float *r_out) { - for (int i = 0; i < 9; i++) { - r_out[i] = 0.0; - } + if (q[0] < 0.0) + { + length = -1.0 * length; + } + + q[1] = length * q[1]; + q[2] = length * q[2]; + q[3] = length * q[3]; + q[0] = length * q[0]; + } + + /** + * @brief get quaternion from transformation matrix + * + * @param t transformation matrix 4x4 + * @param q quaternion in wxyz format + */ + __device__ __forceinline__ void mat_to_quat(float *t, float *q) + { + float n; + float n_sqrt; + + if (t[10] < 0.0) + { + if (t[0] > t[5]) + { + n = 1 + t[0] - t[5] - t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = n * n_sqrt; + q[2] = (t[1] + t[4]) * n_sqrt; + q[3] = (t[8] + t[2]) * n_sqrt; + q[0] = -1 * (t[6] - t[9]) * n_sqrt; // * -1 ; // this is the wrong one? + } + else + { + n = 1 - t[0] + t[5] - t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[1] + t[4]) * n_sqrt; + q[2] = n * n_sqrt; + q[3] = (t[6] + t[9]) * n_sqrt; + q[0] = -1 * (t[8] - t[2]) * n_sqrt; + } + } + else + { + if (t[0] < -1 * t[5]) + { + n = 1 - t[0] - t[5] + t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[8] + t[2]) * n_sqrt; + q[2] = (t[6] + t[9]) * n_sqrt; + q[3] = n * n_sqrt; + q[0] = -1 * (t[1] - t[4]) * n_sqrt; + } + else + { + n = 1 + t[0] + t[5] + t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[6] - t[9]) * n_sqrt; + q[2] = (t[8] - t[2]) * n_sqrt; + q[3] = (t[1] - t[4]) * n_sqrt; + q[0] = -1 * n * n_sqrt; + } + } + normalize_quaternion(q); + } + + /** + * @brief get quaternion from transformation matrix + * + * @param t # rotation matrix 3x3 + * @param q quaternion in wxyz format + */ + __device__ __forceinline__ void rot_to_quat(float *t, float *q) + { + float n; + float n_sqrt; + + if (t[8] < 0.0) + { + if (t[0] > t[4]) + { + n = 1 + t[0] - t[4] - t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = n * n_sqrt; + q[2] = (t[1] + t[3]) * n_sqrt; + q[3] = (t[6] + t[2]) * n_sqrt; + q[0] = -1 * (t[5] - t[7]) * n_sqrt; // * -1 ; // this is the wrong one? 
+ } + else + { + n = 1 - t[0] + t[4] - t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[1] + t[3]) * n_sqrt; + q[2] = n * n_sqrt; + q[3] = (t[5] + t[7]) * n_sqrt; + q[0] = -1 * (t[6] - t[2]) * n_sqrt; + } + } + else + { + if (t[0] < -1 * t[4]) + { + n = 1 - t[0] - t[4] + t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[6] + t[2]) * n_sqrt; + q[2] = (t[5] + t[7]) * n_sqrt; + q[3] = n * n_sqrt; + q[0] = -1 * (t[1] - t[3]) * n_sqrt; + } + else + { + n = 1 + t[0] + t[4] + t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[5] - t[7]) * n_sqrt; + q[2] = (t[6] - t[2]) * n_sqrt; + q[3] = (t[1] - t[3]) * n_sqrt; + q[0] = -1 * n * n_sqrt; + } + } + normalize_quaternion(q); + } + + __device__ __forceinline__ void rot_mul(float *r1, float *r2, float *r_out) + { + for (int i = 0; i < 9; i++) + { + r_out[i] = 0.0; + } #pragma unroll - for (int k = 0; k < 3; k++) { + + for (int k = 0; k < 3; k++) + { #pragma unroll - for (int j = 0; j < 3; j++) { + + for (int j = 0; j < 3; j++) + { #pragma unroll - for (int i = 0; i < 3; i++) { - r_out[i * 3 + j] += r1[i * 3 + k] * r2[k * 3 + j]; + + for (int i = 0; i < 3; i++) + { + r_out[i * 3 + j] += r1[i * 3 + k] * r2[k * 3 + j]; + } + } } } - } -} -__device__ __forceinline__ void rot_inverse_rot_mul(float *r1, float *r2, - float *r_out) { - // multiply two matrices: - r_out[0] = r1[0] * r2[0] + r1[4] * r2[4] + r1[8] * r2[8]; - r_out[1] = r1[0] * r2[1] + r1[4] * r2[5] + r1[8] * r2[9]; - r_out[2] = r1[0] * r2[2] + r1[4] * r2[6] + r1[8] * r2[10]; - r_out[3] = r1[1] * r2[0] + r1[5] * r2[4] + r1[9] * r2[8]; - r_out[4] = r1[1] * r2[1] + r1[5] * r2[5] + r1[9] * r2[9]; - r_out[5] = r1[1] * r2[2] + r1[5] * r2[6] + r1[9] * r2[10]; + __device__ __forceinline__ void rot_inverse_rot_mul(float *r1, float *r2, + float *r_out) + { + // multiply two matrices: + r_out[0] = r1[0] * r2[0] + r1[4] * r2[4] + r1[8] * r2[8]; + r_out[1] = r1[0] * r2[1] + r1[4] * r2[5] + r1[8] * r2[9]; + r_out[2] = r1[0] * r2[2] + r1[4] * r2[6] + r1[8] * r2[10]; - r_out[6] = r1[2] * r2[0] + r1[6] * r2[4] + r1[10] * r2[8]; - r_out[7] = r1[2] * r2[1] + r1[6] * r2[5] + r1[10] * r2[9]; - r_out[8] = r1[2] * r2[2] + r1[6] * r2[6] + r1[10] * r2[10]; -} + r_out[3] = r1[1] * r2[0] + r1[5] * r2[4] + r1[9] * r2[8]; + r_out[4] = r1[1] * r2[1] + r1[5] * r2[5] + r1[9] * r2[9]; + r_out[5] = r1[1] * r2[2] + r1[5] * r2[6] + r1[9] * r2[10]; + + r_out[6] = r1[2] * r2[0] + r1[6] * r2[4] + r1[10] * r2[8]; + r_out[7] = r1[2] * r2[1] + r1[6] * r2[5] + r1[10] * r2[9]; + r_out[8] = r1[2] * r2[2] + r1[6] * r2[6] + r1[10] * r2[10]; + } + + template + __device__ __forceinline__ void + transform_sphere(const float *transform_mat, const scalar_t *sphere, float *C) + { + float4 sphere_pos = *(float4 *)&sphere[0]; + int st_idx = 0; -template -__device__ __forceinline__ void -transform_sphere(const float *transform_mat, const scalar_t *sphere, float *C) { - float4 sphere_pos = *(float4 *)&sphere[0]; - int st_idx = 0; #pragma unroll 3 - for (int i = 0; i < 3; i++) { - st_idx = i * 4; - // do dot product: - // C[i] = transform_mat[st_idx] * sphere_pos.x + transform_mat[st_idx+1] * - // sphere_pos.y + transform_mat[st_idx+2] * sphere_pos.z + - // transform_mat[st_idx + 3]; - float4 tm = *(float4 *)&transform_mat[st_idx]; - C[i] = - tm.x * sphere_pos.x + tm.y * sphere_pos.y + tm.z * sphere_pos.z + tm.w; - } - C[3] = sphere_pos.w; -} -template -__device__ __forceinline__ void fixed_joint_fn(const scalar_t *fixedTransform, - float *JM) { - JM[0] = fixedTransform[0]; - JM[1] = fixedTransform[M]; - JM[2] = fixedTransform[M * 2]; - JM[3] = fixedTransform[M * 
3]; -} + for (int i = 0; i < 3; i++) + { + st_idx = i * 4; -// prism_fn withOUT control flow -template -__device__ __forceinline__ void prism_fn(const scalar_t *fixedTransform, - const float angle, const int col_idx, - float *JM, const int xyz) { - // int _and = (col_idx & (col_idx>>1)) & 0x1; // 1 for thread 3, 0 for all - // other threads (0,1,2) - // - // float f1 = (1-_and) + _and * angle; // 1 for threads 0,1,2; angle for - // thread 3 int addr_offset = (1-_and) * col_idx + _and * xyz; // col_idx - // for threads 0,1,2; xyz for thread 3 - // - // JM[0] = fixedTransform[0 + addr_offset] * f1 + _and * - // fixedTransform[3];//FT_0[1]; JM[1] = fixedTransform[M + addr_offset] - // * f1 + _and * fixedTransform[M + 3];//FT_1[1]; JM[2] = fixedTransform[M - // + M + addr_offset] * f1 + _and * fixedTransform[M + M + 3];//FT_2[1]; - // JM[3] = fixedTransform[M + M + M + addr_offset] * (1-_and) + _and * 1; // - // first three threads will get fixedTransform[3M+col_idx], the last thread - // will get 1 - - if (col_idx <= 2) { - fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); - } else { - JM[0] = fixedTransform[0 + xyz] * angle + fixedTransform[3]; // FT_0[1]; - JM[1] = fixedTransform[M + xyz] * angle + fixedTransform[M + 3]; // FT_1[1]; - JM[2] = fixedTransform[M + M + xyz] * angle + - fixedTransform[M + M + 3]; // FT_2[1]; - JM[3] = 1; - } -} - -__device__ __forceinline__ void update_joint_type_direction(int &j_type, int8_t &axis_sign) -{ - // Assume that input j_type >= 0 . Check fixed joint outside of this function. - - // Don't do anything if j_type < 6. j_type range is [0, 11] - // divergence here. - axis_sign = 1; - if (j_type >= 6) - { - j_type -= 6; - axis_sign = -1; - } -} - - -__device__ __forceinline__ void update_joint_type_direction(int &j_type) -{ - // Assume that input j_type >= 0 . Check fixed joint outside of this function. - - // Don't do anything if j_type < 6. j_type range is [0, 11] - // divergence here. - if (j_type >= 6) - { - j_type -= 6; - } -} - -__device__ __forceinline__ void update_axis_direction( - float &angle, - int &j_type) -{ - // Assume that input j_type >= 0 . Check fixed joint outside of this function. - // sign should be +ve <= 5 and -ve >5 - // j_type range is [0, 11]. - // cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 = +ve, - // then this code should be j_type - 5. - angle = -1 * copysignf(1.0, j_type - 6) * angle; - - update_joint_type_direction(j_type); - -} - -// In the following versions of rot_fn, some non-nan values may become nan as we -// add multiple values instead of using if-else/switch-case. - -// version with no control flow -template -__device__ __forceinline__ void xrot_fn(const scalar_t *fixedTransform, - const float angle, const int col_idx, - float *JM) { - // we found no change in convergence between fast approximate and IEEE sin, - // cos functions using fast approximate method saves 5 registers per thread. 
- float cos = __cosf(angle); - float sin = __sinf(angle); - float n_sin = -1 * sin; - - int bit1 = col_idx & 0x1; - int bit2 = (col_idx & 0x2) >> 1; - int _xor = bit1 ^ bit2; // 0 for threads 0 and 3, 1 for threads 1 and 2 - int col_idx_by_2 = - col_idx / 2; // 0 for threads 0 and 1, 1 for threads 2 and 3 - - float f1 = (1 - col_idx_by_2) * cos + - col_idx_by_2 * n_sin; // thread 1 get cos , thread 2 gets n_sin - float f2 = (1 - col_idx_by_2) * sin + - col_idx_by_2 * cos; // thread 1 get sin, thread 2 gets cos - - f1 = _xor * f1 + (1 - _xor) * 1; // threads 1 and 2 will get f1; the other - // two threads will get 1 - f2 = _xor * - f2; // threads 1 and 2 will get f2, the other two threads will get 0.0 - float f3 = 1 - _xor; - - int addr_offset = - _xor + (1 - _xor) * - col_idx; // 1 for threads 1 and 2, col_idx for threads 0 and 3 - - JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; - JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; - JM[2] = - fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; - JM[3] = fixedTransform[M + M + M + addr_offset] * - f3; // threads 1 and 2 get 0.0, remaining two get fixedTransform[3M]; -} - -// version with no control flow -template -__device__ __forceinline__ void yrot_fn(const scalar_t *fixedTransform, - const float angle, const int col_idx, - float *JM) { - float cos = __cosf(angle); - float sin = __sinf(angle); - float n_sin = -1 * sin; - - int col_idx_per_2 = - col_idx % 2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. - int col_idx_by_2 = - col_idx / 2; // threads 0 and 1 will be 0 and threads 2 and 3 will be 1. - - float f1 = (1 - col_idx_by_2) * cos + - col_idx_by_2 * sin; // thread 0 get cos , thread 2 gets sin - float f2 = (1 - col_idx_by_2) * n_sin + - col_idx_by_2 * cos; // thread 0 get n_sin, thread 2 gets cos - - f1 = (1 - col_idx_per_2) * f1 + - col_idx_per_2 * 1; // threads 0 and 2 will get f1; the other two - // threads will get 1 - f2 = (1 - col_idx_per_2) * - f2; // threads 0 and 2 will get f2, the other two threads will get 0.0 - float f3 = - col_idx_per_2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. - - int addr_offset = - col_idx_per_2 * - col_idx; // threads 0 and 2 will get 0, the other two will get col_idx. - - JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; - JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; - JM[2] = - fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; - JM[3] = fixedTransform[M + M + M + addr_offset] * - f3; // threads 0 and 2 threads get 0.0, remaining two get - // fixedTransform[3M]; -} - -// version with no control flow -template -__device__ __forceinline__ void zrot_fn(const scalar_t *fixedTransform, - const float angle, const int col_idx, - float *JM) { - float cos = __cosf(angle); - float sin = __sinf(angle); - float n_sin = -1 * sin; - - int col_idx_by_2 = - col_idx / 2; // first two threads will be 0 and the next two will be 1. - int col_idx_per_2 = - col_idx % 2; // first thread will be 0 and the second thread will be 1. 
- float f1 = (1 - col_idx_per_2) * cos + - col_idx_per_2 * n_sin; // thread 0 get cos , thread 1 gets n_sin - float f2 = (1 - col_idx_per_2) * sin + - col_idx_per_2 * cos; // thread 0 get sin, thread 1 gets cos - - f1 = (1 - col_idx_by_2) * f1 + - col_idx_by_2 * 1; // first two threads get f1, other two threads get 1 - f2 = (1 - col_idx_by_2) * - f2; // first two threads get f2, other two threads get 0.0 - - int addr_offset = - col_idx_by_2 * - col_idx; // first 2 threads will get 0, the other two will get col_idx. - - JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[1]; - JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 1]; - JM[2] = - fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 1]; - JM[3] = fixedTransform[M + M + M + addr_offset] * - col_idx_by_2; // first two threads get 0.0, remaining two get - // fixedTransform[3M]; -} - -template -__device__ __forceinline__ void -rot_backward_translation(const float3 &vec, float *cumul_mat, float *l_pos, - const float3 &loc_grad, psum_t &grad_q, const int8_t axis_sign=1) { - - float3 e_pos, j_pos; - - e_pos.x = cumul_mat[3]; - e_pos.y = cumul_mat[4 + 3]; - e_pos.z = cumul_mat[4 + 4 + 3]; - - // compute position gradient: - j_pos = *(float3 *)&l_pos[0] - e_pos; // - e_pos; - float3 scale_grad = axis_sign * loc_grad; - scale_cross_sum(vec, j_pos, scale_grad, grad_q); // cross product -} - - -template -__device__ __forceinline__ void -rot_backward_rotation(const float3 vec, const float3 grad_vec, psum_t &grad_q, const int8_t axis_sign=1) { - grad_q += axis_sign * dot(vec, grad_vec); -} - - -template -__device__ __forceinline__ void -prism_backward_translation(const float3 vec, const float3 grad_vec, - psum_t &grad_q, const int8_t axis_sign=1) { - grad_q += axis_sign * dot(vec, grad_vec); -} -template -__device__ __forceinline__ void -z_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, - float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) { - float3 vec = - make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]); - - // get rotation vector: - rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], - loc_grad_position, grad_q, axis_sign); - - rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void -x_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, - float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) { - float3 vec = - make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]); - - // get rotation vector: - rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], - loc_grad_position, grad_q, axis_sign); - - rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); -} -template -__device__ __forceinline__ void -y_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, - float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) { - float3 vec = - make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]); - - // get rotation vector: - rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], - loc_grad_position, grad_q, axis_sign); - - rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void -xyz_prism_backward_translation(float *cumul_mat, float3 &loc_grad, - psum_t &grad_q, int xyz, const int8_t axis_sign=1) { - prism_backward_translation( - make_float3(cumul_mat[0 + xyz], cumul_mat[4 
+ xyz], cumul_mat[8 + xyz]), - loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat, - float3 &loc_grad, - psum_t &grad_q, - const int8_t axis_sign=1) { - // get rotation vector: - prism_backward_translation( - make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat, - float3 &loc_grad, - psum_t &grad_q, - const int8_t axis_sign=1) { - // get rotation vector: - prism_backward_translation( - make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat, - float3 &loc_grad, - psum_t &grad_q, - const int8_t axis_sign=1) { - // get rotation vector: - prism_backward_translation( - make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign); -} - -__device__ __forceinline__ void -xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, - float &grad_q, int xyz, const int8_t axis_sign=1) { - // get rotation vector: - rot_backward_translation( - make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]), - &cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void -x_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, - psum_t &grad_q, const int8_t axis_sign=1) { - // get rotation vector: - rot_backward_translation( - make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), &cumul_mat[0], - &l_pos[0], loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void -y_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, - psum_t &grad_q, const int8_t axis_sign=1) { - // get rotation vector: - rot_backward_translation( - make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), &cumul_mat[0], - &l_pos[0], loc_grad, grad_q, axis_sign); -} - -template -__device__ __forceinline__ void -z_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, - psum_t &grad_q, const int8_t axis_sign=1) { - // get rotation vector: - rot_backward_translation( - make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), &cumul_mat[0], - &l_pos[0], loc_grad, grad_q, axis_sign); -} - -// An optimized version of kin_fused_warp_kernel. -// This one should be about 10% faster. 
-template -__global__ void -kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M - scalar_t *link_quat, // batchSize x store_n_links x M x M - scalar_t *b_robot_spheres, // batchSize x nspheres x M - scalar_t *global_cumul_mat, - const scalar_t *q, // batchSize x njoints - const scalar_t *fixedTransform, // nlinks x M x M - const scalar_t *robot_spheres, // nspheres x M - const int8_t *jointMapType, // nlinks - const int16_t *jointMap, // nlinks - const int16_t *linkMap, // nlinks - const int16_t *storeLinkMap, // store_n_links - const int16_t *linkSphereMap, // nspheres - const int batchSize, const int nspheres, - const int nlinks, const int njoints, - const int store_n_links) { - extern __shared__ float cumul_mat[]; - - int t = blockDim.x * blockIdx.x + threadIdx.x; - const int batch = t / 4; - if (batch >= batchSize) - return; - - int col_idx = threadIdx.x % 4; - const int local_batch = threadIdx.x / 4; - const int matAddrBase = local_batch * nlinks * M * M; - // read all fixed transforms to local cache: - - // copy base link transform: - *(float4 *)&cumul_mat[matAddrBase + col_idx * M] = - *(float4 *)&fixedTransform[col_idx * M]; - - if (use_global_cumul) { - *(float4 *)&global_cumul_mat[batch * nlinks * 16 + col_idx * M] = - *(float4 *)&cumul_mat[matAddrBase + col_idx * M]; - } - for (int8_t l = 1; l < nlinks; l++) // - { - - // get one row of fixedTransform - int ftAddrStart = l * M * M; - int inAddrStart = matAddrBase + linkMap[l] * M * M; - int outAddrStart = matAddrBase + l * M * M; - - // row index: - // check joint type and use one of the helper functions: - float JM[M]; - int j_type = jointMapType[l]; - - if (j_type == FIXED) - { - fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); - } - else - { - float angle = q[batch * njoints + jointMap[l]]; - update_axis_direction(angle, j_type); - if (j_type <= Z_PRISM) { - prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); - } else if (j_type == X_ROT) { - xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else if (j_type == Y_ROT) { - yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else if (j_type == Z_ROT) { - zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else { - assert(j_type >= FIXED && j_type <= Z_ROT); + // do dot product: + // C[i] = transform_mat[st_idx] * sphere_pos.x + transform_mat[st_idx+1] * + // sphere_pos.y + transform_mat[st_idx+2] * sphere_pos.z + + // transform_mat[st_idx + 3]; + float4 tm = *(float4 *)&transform_mat[st_idx]; + C[i] = + tm.x * sphere_pos.x + tm.y * sphere_pos.y + tm.z * sphere_pos.z + tm.w; } - } + C[3] = sphere_pos.w; + } + + template + __device__ __forceinline__ void fixed_joint_fn(const scalar_t *fixedTransform, + float *JM) + { + JM[0] = fixedTransform[0]; + JM[1] = fixedTransform[M]; + JM[2] = fixedTransform[M * 2]; + JM[3] = fixedTransform[M * 3]; + } + + // prism_fn withOUT control flow + template + __device__ __forceinline__ void prism_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM, const int xyz) + { + // int _and = (col_idx & (col_idx>>1)) & 0x1; // 1 for thread 3, 0 for all + // other threads (0,1,2) + // + // float f1 = (1-_and) + _and * angle; // 1 for threads 0,1,2; angle for + // thread 3 int addr_offset = (1-_and) * col_idx + _and * xyz; // col_idx + // for threads 0,1,2; xyz for thread 3 + // + // JM[0] = fixedTransform[0 + addr_offset] * f1 + _and * + // fixedTransform[3];//FT_0[1]; JM[1] = fixedTransform[M + addr_offset] + // * 
f1 + _and * fixedTransform[M + 3];//FT_1[1]; JM[2] = fixedTransform[M + // + M + addr_offset] * f1 + _and * fixedTransform[M + M + 3];//FT_2[1]; + // JM[3] = fixedTransform[M + M + M + addr_offset] * (1-_and) + _and * 1; // + // first three threads will get fixedTransform[3M+col_idx], the last thread + // will get 1 + + if (col_idx <= 2) + { + fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); + } + else + { + JM[0] = fixedTransform[0 + xyz] * angle + fixedTransform[3]; // FT_0[1]; + JM[1] = fixedTransform[M + xyz] * angle + fixedTransform[M + 3]; // FT_1[1]; + JM[2] = fixedTransform[M + M + xyz] * angle + + fixedTransform[M + M + 3]; // FT_2[1]; + JM[3] = 1; + } + } + + __device__ __forceinline__ void update_joint_type_direction(int& j_type, int8_t& axis_sign) + { + // Assume that input j_type >= 0 . Check fixed joint outside of this function. + + // Don't do anything if j_type < 6. j_type range is [0, 11] + // divergence here. + axis_sign = 1; + + if (j_type >= 6) + { + j_type -= 6; + axis_sign = -1; + } + } + + __device__ __forceinline__ void update_joint_type_direction(int& j_type) + { + // Assume that input j_type >= 0 . Check fixed joint outside of this function. + + // Don't do anything if j_type < 6. j_type range is [0, 11] + // divergence here. + if (j_type >= 6) + { + j_type -= 6; + } + } + + __device__ __forceinline__ void update_axis_direction( + float& angle, + int & j_type) + { + // Assume that input j_type >= 0 . Check fixed joint outside of this function. + // sign should be +ve <= 5 and -ve >5 + // j_type range is [0, 11]. + // cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 = + // +ve, + // then this code should be j_type - 5. + angle = -1 * copysignf(1.0, j_type - 6) * angle; + + update_joint_type_direction(j_type); + } + + // In the following versions of rot_fn, some non-nan values may become nan as we + // add multiple values instead of using if-else/switch-case. + + // version with no control flow + template + __device__ __forceinline__ void xrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) + { + // we found no change in convergence between fast approximate and IEEE sin, + // cos functions using fast approximate method saves 5 registers per thread. 
+ float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int bit1 = col_idx & 0x1; + int bit2 = (col_idx & 0x2) >> 1; + int _xor = bit1 ^ bit2; // 0 for threads 0 and 3, 1 for threads 1 and 2 + int col_idx_by_2 = + col_idx / 2; // 0 for threads 0 and 1, 1 for threads 2 and 3 + + float f1 = (1 - col_idx_by_2) * cos + + col_idx_by_2 * n_sin; // thread 1 get cos , thread 2 gets n_sin + float f2 = (1 - col_idx_by_2) * sin + + col_idx_by_2 * cos; // thread 1 get sin, thread 2 gets cos + + f1 = _xor * f1 + (1 - _xor) * 1; // threads 1 and 2 will get f1; the other + // two threads will get 1 + f2 = _xor * + f2; // threads 1 and 2 will get f2, the other two threads will + // get 0.0 + float f3 = 1 - _xor; + + int addr_offset = + _xor + (1 - _xor) * + col_idx; // 1 for threads 1 and 2, col_idx for threads 0 and 3 + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + f3; // threads 1 and 2 get 0.0, remaining two get fixedTransform[3M]; + } + + // version with no control flow + template + __device__ __forceinline__ void yrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) + { + float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int col_idx_per_2 = + col_idx % 2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. + int col_idx_by_2 = + col_idx / 2; // threads 0 and 1 will be 0 and threads 2 and 3 will be 1. + + float f1 = (1 - col_idx_by_2) * cos + + col_idx_by_2 * sin; // thread 0 get cos , thread 2 gets sin + float f2 = (1 - col_idx_by_2) * n_sin + + col_idx_by_2 * cos; // thread 0 get n_sin, thread 2 gets cos + + f1 = (1 - col_idx_per_2) * f1 + + col_idx_per_2 * 1; // threads 0 and 2 will get f1; the other two + // threads will get 1 + f2 = (1 - col_idx_per_2) * + f2; // threads 0 and 2 will get f2, the other two threads will get + // 0.0 + float f3 = + col_idx_per_2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. + + int addr_offset = + col_idx_per_2 * + col_idx; // threads 0 and 2 will get 0, the other two will get col_idx. + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + f3; // threads 0 and 2 threads get 0.0, remaining two get + // fixedTransform[3M]; + } + + // version with no control flow + template + __device__ __forceinline__ void zrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) + { + float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int col_idx_by_2 = + col_idx / 2; // first two threads will be 0 and the next two will be 1. + int col_idx_per_2 = + col_idx % 2; // first thread will be 0 and the second thread will be 1. 
+ float f1 = (1 - col_idx_per_2) * cos + + col_idx_per_2 * n_sin; // thread 0 get cos , thread 1 gets n_sin + float f2 = (1 - col_idx_per_2) * sin + + col_idx_per_2 * cos; // thread 0 get sin, thread 1 gets cos + + f1 = (1 - col_idx_by_2) * f1 + + col_idx_by_2 * 1; // first two threads get f1, other two threads get 1 + f2 = (1 - col_idx_by_2) * + f2; // first two threads get f2, other two threads get 0.0 + + int addr_offset = + col_idx_by_2 * + col_idx; // first 2 threads will get 0, the other two will get col_idx. + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[1]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 1]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 1]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + col_idx_by_2; // first two threads get 0.0, remaining two get + // fixedTransform[3M]; + } + + template + __device__ __forceinline__ void + rot_backward_translation(const float3& vec, float *cumul_mat, float *l_pos, + const float3& loc_grad, psum_t& grad_q, const int8_t axis_sign = 1) + { + float3 e_pos, j_pos; + + e_pos.x = cumul_mat[3]; + e_pos.y = cumul_mat[4 + 3]; + e_pos.z = cumul_mat[4 + 4 + 3]; + + // compute position gradient: + j_pos = *(float3 *)&l_pos[0] - e_pos; // - e_pos; + float3 scale_grad = axis_sign * loc_grad; + scale_cross_sum(vec, j_pos, scale_grad, grad_q); // cross product + } + + template + __device__ __forceinline__ void + rot_backward_rotation(const float3 vec, + const float3 grad_vec, + psum_t & grad_q, + const int8_t axis_sign = 1) + { + grad_q += axis_sign * dot(vec, grad_vec); + } + + template + __device__ __forceinline__ void + prism_backward_translation(const float3 vec, const float3 grad_vec, + psum_t& grad_q, const int8_t axis_sign = 1) + { + grad_q += axis_sign * dot(vec, grad_vec); + } + + template + __device__ __forceinline__ void + z_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position, + float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1) + { + float3 vec = + make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q, axis_sign); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + x_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position, + float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1) + { + float3 vec = + make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q, axis_sign); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + y_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position, + float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1) + { + float3 vec = + make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q, axis_sign); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + xyz_prism_backward_translation(float *cumul_mat, float3& loc_grad, + psum_t& grad_q, int xyz, const int8_t axis_sign = 1) + { + 
prism_backward_translation( + make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]), + loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat, + float3 & loc_grad, + psum_t & grad_q, + const int8_t axis_sign = 1) + { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat, + float3 & loc_grad, + psum_t & grad_q, + const int8_t axis_sign = 1) + { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat, + float3 & loc_grad, + psum_t & grad_q, + const int8_t axis_sign = 1) + { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign); + } + + __device__ __forceinline__ void + xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad, + float& grad_q, int xyz, const int8_t axis_sign = 1) + { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]), + &cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + x_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad, + psum_t& grad_q, const int8_t axis_sign = 1) + { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + y_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad, + psum_t& grad_q, const int8_t axis_sign = 1) + { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q, axis_sign); + } + + template + __device__ __forceinline__ void + z_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad, + psum_t& grad_q, const int8_t axis_sign = 1) + { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q, axis_sign); + } + + // An optimized version of kin_fused_warp_kernel. + // This one should be about 10% faster. 
+ template + __global__ void + kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M + scalar_t *link_quat, // batchSize x store_n_links x M x M + scalar_t *b_robot_spheres, // batchSize x nspheres x M + scalar_t *global_cumul_mat, + const scalar_t *q, // batchSize x njoints + const scalar_t *fixedTransform, // nlinks x M x M + const scalar_t *robot_spheres, // nspheres x M + const int8_t *jointMapType, // nlinks + const int16_t *jointMap, // nlinks + const int16_t *linkMap, // nlinks + const int16_t *storeLinkMap, // store_n_links + const int16_t *linkSphereMap, // nspheres + const int batchSize, const int nspheres, + const int nlinks, const int njoints, + const int store_n_links) + { + extern __shared__ float cumul_mat[]; + + int t = blockDim.x * blockIdx.x + threadIdx.x; + const int batch = t / 4; + + if (batch >= batchSize) + return; + + int col_idx = threadIdx.x % 4; + const int local_batch = threadIdx.x / 4; + const int matAddrBase = local_batch * nlinks * M * M; + + // read all fixed transforms to local cache: + + // copy base link transform: + *(float4 *)&cumul_mat[matAddrBase + col_idx * M] = + *(float4 *)&fixedTransform[col_idx * M]; + + if (use_global_cumul) + { + *(float4 *)&global_cumul_mat[batch * nlinks * 16 + col_idx * M] = + *(float4 *)&cumul_mat[matAddrBase + col_idx * M]; + } + + for (int8_t l = 1; l < nlinks; l++) // + { + // get one row of fixedTransform + int ftAddrStart = l * M * M; + int inAddrStart = matAddrBase + linkMap[l] * M * M; + int outAddrStart = matAddrBase + l * M * M; + + // row index: + // check joint type and use one of the helper functions: + float JM[M]; + int j_type = jointMapType[l]; + + if (j_type == FIXED) + { + fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); + } + else + { + float angle = q[batch * njoints + jointMap[l]]; + update_axis_direction(angle, j_type); + + if (j_type <= Z_PRISM) + { + prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); + } + else if (j_type == X_ROT) + { + xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else if (j_type == Y_ROT) + { + yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else if (j_type == Z_ROT) + { + zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else + { + assert(j_type >= FIXED && j_type <= Z_ROT); + } + } #pragma unroll 4 - for (int i = 0; i < M; i++) { - cumul_mat[outAddrStart + (i * M) + col_idx] = - dot(*(float4 *)&cumul_mat[inAddrStart + (i * M)], *(float4 *)JM); - } - if (use_global_cumul) { - *(float4 *)&global_cumul_mat[batch * nlinks * 16 + l * 16 + col_idx * M] = - *(float4 *)&cumul_mat[outAddrStart + col_idx * M]; - } - } - // write out link: - - // do robot_spheres - - const int batchAddrs = batch * nspheres * 4; - // read cumul mat index to run for this thread: - int16_t read_cumul_idx = -1; - int16_t spheres_perthread = (nspheres + 3) / 4; - for (int16_t i = 0; i < spheres_perthread; i++) { - //const int16_t sph_idx = col_idx * spheres_perthread + i; - const int16_t sph_idx = col_idx + i * 4; - // const int8_t sph_idx = - // i * 4 + col_idx; // different order such that adjacent - // spheres are in neighboring threads - if (sph_idx >= nspheres) { - break; - } - // read cumul idx: - read_cumul_idx = linkSphereMap[sph_idx]; - float spheres_mem[4]; - const int16_t sphAddrs = sph_idx * 4; - - transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], - &robot_spheres[sphAddrs], &spheres_mem[0]); - *(float4 *)&b_robot_spheres[batchAddrs + sphAddrs] = - *(float4 
*)&spheres_mem[0]; - } - - // write position and rotation, we convert rotation matrix to a quaternion and - // write it out - for (int16_t i = 0; i < store_n_links; i++) { - int16_t l_map = storeLinkMap[i]; - int l_outAddrStart = - (batch * store_n_links); // * 7) + i * 7;// + (t % M) * M; - int outAddrStart = matAddrBase + l_map * M * M; - - float quat[4]; - // TODO: spread the work to different threads. For now all the threads will - // do the same work. - mat_to_quat( - &cumul_mat[outAddrStart], - &quat[0]); // get quaternion, all the 4 threads will do the same work - link_quat[l_outAddrStart * 4 + i * 4 + col_idx] = - quat[col_idx]; // one thread will write one element to memory - - if (col_idx < 3) { - // threads 0,1,2 will execute the following store - link_pos[l_outAddrStart * 3 + i * 3 + col_idx] = - cumul_mat[outAddrStart + 3 + (col_idx)*4]; - } - } -} - -// kin_fused_backward_kernel3 uses 16 threads per batch, instead of 4 per batch -// as in kin_fused_backward_kernel2. -template -__global__ void kin_fused_backward_kernel3( - scalar_t *grad_out_link_q, // batchSize * njoints - const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16 - const scalar_t *grad_nlinks_quat, - const scalar_t *grad_spheres, // batchSize * nspheres * 4 - const scalar_t *global_cumul_mat, - const scalar_t *q, // batchSize * njoints - const scalar_t *fixedTransform, // nlinks * 16 - const scalar_t *robotSpheres, // batchSize * nspheres * 4 - const int8_t *jointMapType, // nlinks - const int16_t *jointMap, // nlinks - const int16_t *linkMap, // nlinks - const int16_t *storeLinkMap, // store_n_links - const int16_t *linkSphereMap, // nspheres - const int16_t *linkChainMap, // nlinks*nlinks - const int batchSize, const int nspheres, const int nlinks, - const int njoints, const int store_n_links) { - extern __shared__ float cumul_mat[]; - - int t = blockDim.x * blockIdx.x + threadIdx.x; - const int batch = t / 16; - unsigned mask = __ballot_sync(0xffffffff, batch < batchSize); - - if (batch >= batchSize) - return; - - // Each thread computes one element of the cumul_mat. 
- // first 4 threads compute a row of the output; - const int elem_idx = threadIdx.x % 16; - const int col_idx = elem_idx % 4; - const int local_batch = threadIdx.x / 16; - const int matAddrBase = local_batch * nlinks * M * M; - - if (use_global_cumul) { - for (int l = 0; l < nlinks; l++) { - int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; - cumul_mat[outAddrStart + elem_idx] = - global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx]; - } - } else { - - cumul_mat[matAddrBase + elem_idx] = fixedTransform[elem_idx]; - - for (int l = 1; l < nlinks; l++) // TODO: add base link transform - { - float JM[M]; // store one row locally for mat-mul - int ftAddrStart = l * M * M; //+ (t % M) * M; - int inAddrStart = matAddrBase + linkMap[l] * M * M; - int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; - - int j_type = jointMapType[l]; - - - if (j_type == FIXED) { - fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); - } - else { - float angle = q[batch * njoints + jointMap[l]]; - - update_axis_direction(angle, j_type); - - if (j_type <= Z_PRISM) { - prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); - } else if (j_type == X_ROT) { - xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else if (j_type == Y_ROT) { - yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else if (j_type == Z_ROT) { - zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); - } else { - assert(j_type >= FIXED && j_type <= Z_ROT); + for (int i = 0; i < M; i++) + { + cumul_mat[outAddrStart + (i * M) + col_idx] = + dot(*(float4 *)&cumul_mat[inAddrStart + (i * M)], *(float4 *)JM); + } + + if (use_global_cumul) + { + *(float4 *)&global_cumul_mat[batch * nlinks * 16 + l * 16 + col_idx * M] = + *(float4 *)&cumul_mat[outAddrStart + col_idx * M]; } - } - // fetch one row of cumul_mat, multiply with a column, which is in JM - cumul_mat[outAddrStart + elem_idx] = - dot(*(float4 *)&cumul_mat[inAddrStart + ((elem_idx / 4) * M)], - *(float4 *)JM); - } - } - // thread-local partial sum accumulators - // We would like to keep these partial sums in register file and avoid memory - // accesses - psum_t psum_grad[MAX_JOINTS]; // MAX_JOINTS -// we are allocating a lot larger array. So, we will be initilizing just the -// portion we need. 
-#pragma unroll - for (int i = 0; i < njoints; i++) { - psum_grad[i] = 0.0; - } - // read cumul mat index to run for this thread: - int read_cumul_idx = -1; + // write out link: - const int spheres_perthread = (nspheres + 15) / 16; + // do robot_spheres - for (int i = 0; i < spheres_perthread; i++) { - //const int sph_idx = elem_idx * spheres_perthread + i; - const int sph_idx = elem_idx + i * 16; - if (sph_idx >= nspheres) { - break; - } - const int sphAddrs = sph_idx * 4; - const int batchAddrs = batch * nspheres * 4; - float4 loc_grad_sphere_t = *(float4 *)&grad_spheres[batchAddrs + sphAddrs]; - // Sparsity-based optimization: Skip zero computation - if (enable_sparsity_opt) { - if (loc_grad_sphere_t.x == 0 && loc_grad_sphere_t.y == 0 && - loc_grad_sphere_t.z == 0) { - continue; - } - } - float3 loc_grad_sphere = make_float3( - loc_grad_sphere_t.x, loc_grad_sphere_t.y, loc_grad_sphere_t.z); + const int batchAddrs = batch * nspheres * 4; - // read cumul idx: - read_cumul_idx = linkSphereMap[sph_idx]; - float spheres_mem[4]; - transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], - &robotSpheres[sphAddrs], &spheres_mem[0]); - // assuming this sphere only depends on links lower than this index - // This could be relaxed by making read_cumul_idx = number of links. - //const int16_t loop_max = read_cumul_idx; - const int16_t loop_max = nlinks - 1; - for (int j = loop_max; j > -1; j--) { - if (linkChainMap[read_cumul_idx * nlinks + j] == 0.0) { - continue; - } - int8_t axis_sign = 1; + // read cumul mat index to run for this thread: + int16_t read_cumul_idx = -1; + int16_t spheres_perthread = (nspheres + 3) / 4; - int j_type = jointMapType[j]; - if(j_type != FIXED) + for (int16_t i = 0; i < spheres_perthread; i++) { - update_joint_type_direction(j_type, axis_sign); + // const int16_t sph_idx = col_idx * spheres_perthread + i; + const int16_t sph_idx = col_idx + i * 4; + + // const int8_t sph_idx = + // i * 4 + col_idx; // different order such that adjacent + // spheres are in neighboring threads + if (sph_idx >= nspheres) + { + break; + } + + // read cumul idx: + read_cumul_idx = linkSphereMap[sph_idx]; + float spheres_mem[4]; + const int16_t sphAddrs = sph_idx * 4; + + transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], + &robot_spheres[sphAddrs], &spheres_mem[0]); + *(float4 *)&b_robot_spheres[batchAddrs + sphAddrs] = + *(float4 *)&spheres_mem[0]; } - if (j_type == Z_ROT) { - float result = 0.0; - z_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], - &spheres_mem[0], loc_grad_sphere, result, axis_sign); - psum_grad[jointMap[j]] += (psum_t)result; - } else if (j_type >= X_PRISM && j_type <= Z_PRISM) { - float result = 0.0; - xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], - loc_grad_sphere, result, j_type, axis_sign); - psum_grad[jointMap[j]] += (psum_t)result; - } else if (j_type == X_ROT) { - float result = 0.0; - x_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], - &spheres_mem[0], loc_grad_sphere, result, axis_sign); - psum_grad[jointMap[j]] += (psum_t)result; - } else if (j_type == Y_ROT) { - float result = 0.0; - y_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], - &spheres_mem[0], loc_grad_sphere, result, axis_sign); - psum_grad[jointMap[j]] += (psum_t)result; + + // write position and rotation, we convert rotation matrix to a quaternion and + // write it out + for (int16_t i = 0; i < store_n_links; i++) + { + int16_t l_map = storeLinkMap[i]; + int l_outAddrStart = + (batch * store_n_links); // * 7) + i * 7;// + (t % 
M) * M; + int outAddrStart = matAddrBase + l_map * M * M; + + float quat[4]; + + // TODO: spread the work to different threads. For now all the threads will + // do the same work. + mat_to_quat( + &cumul_mat[outAddrStart], + &quat[0]); // get quaternion, all the 4 threads will do the same work + link_quat[l_outAddrStart * 4 + i * 4 + col_idx] = + quat[col_idx]; // one thread will write one element to memory + + if (col_idx < 3) + { + // threads 0,1,2 will execute the following store + link_pos[l_outAddrStart * 3 + i * 3 + col_idx] = + cumul_mat[outAddrStart + 3 + (col_idx) * 4]; + } } } - } - - // Instead of accumulating the sphere_grad and link_grad separately, we will - // accumulate them together once below. - // - // // accumulate across 4 threads using shuffle operation - // for(int j=0; j= 0; k--) - for (int16_t k=0; k < joints_per_thread; k++) + // kin_fused_backward_kernel3 uses 16 threads per batch, instead of 4 per batch + // as in kin_fused_backward_kernel2. + template + __global__ void kin_fused_backward_kernel3( + scalar_t *grad_out_link_q, // batchSize * njoints + const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16 + const scalar_t *grad_nlinks_quat, + const scalar_t *grad_spheres, // batchSize * nspheres * 4 + const scalar_t *global_cumul_mat, + const scalar_t *q, // batchSize * njoints + const scalar_t *fixedTransform, // nlinks * 16 + const scalar_t *robotSpheres, // batchSize * nspheres * 4 + const int8_t *jointMapType, // nlinks + const int16_t *jointMap, // nlinks + const int16_t *linkMap, // nlinks + const int16_t *storeLinkMap, // store_n_links + const int16_t *linkSphereMap, // nspheres + const int16_t *linkChainMap, // nlinks*nlinks + const int batchSize, const int nspheres, const int nlinks, + const int njoints, const int store_n_links) { - - //int16_t j = elem_idx * joints_per_thread + k; - int16_t j = elem_idx + k * 16; - //int16_t j = k * M + elem_idx; - if (j > max_lmap || j < 0) - continue; - // This can be spread across threads as they are not sequential? - if (linkChainMap[l_map * nlinks + j] == 0.0) { - continue; - } - int16_t j_idx = jointMap[j]; - int j_type = jointMapType[j]; + extern __shared__ float cumul_mat[]; - int8_t axis_sign = 1; - if (j_type != FIXED) + int t = blockDim.x * blockIdx.x + threadIdx.x; + const int batch = t / 16; + unsigned mask = __ballot_sync(0xffffffff, batch < batchSize); + + if (batch >= batchSize) + return; + + // Each thread computes one element of the cumul_mat. 
+ // first 4 threads compute a row of the output; + const int elem_idx = threadIdx.x % 16; + const int col_idx = elem_idx % 4; + const int local_batch = threadIdx.x / 16; + const int matAddrBase = local_batch * nlinks * M * M; + + if (use_global_cumul) { - update_joint_type_direction(j_type, axis_sign); + for (int l = 0; l < nlinks; l++) + { + int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; + cumul_mat[outAddrStart + elem_idx] = + global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx]; + } } - - // get rotation vector: - if (j_type == Z_ROT) { - z_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], - g_position, g_orientation, psum_grad[j_idx], axis_sign); - } else if (j_type >= X_PRISM & j_type <= Z_PRISM) { - xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], - g_position, psum_grad[j_idx], j_type, axis_sign); - } else if (j_type == X_ROT) { - x_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], - g_position, g_orientation, psum_grad[j_idx], axis_sign); - } else if (j_type == Y_ROT) { - y_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], - g_position, g_orientation, psum_grad[j_idx], axis_sign); - } - } - } - if (PARALLEL_WRITE) { - -// accumulate the partial sums across the 16 threads -#pragma unroll - for (int16_t j = 0; j < njoints; j++) { - psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 1); - psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 2); - psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 4); - psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 8); - // thread 0: psum_grad[j] will have the sum across 16 threads - // write out using only thread 0 - } - - const int16_t joints_per_thread = (njoints + 15) / 16; - -#pragma unroll - for (int16_t j = 0; j < joints_per_thread; j++) { - //const int16_t j_idx = elem_idx * joints_per_thread + j; - const int16_t j_idx = elem_idx + j * 16; - if (j_idx >= njoints) { - break; - } - grad_out_link_q[batch * njoints + j_idx] = - psum_grad[j_idx]; // write the sum to memory - } - } else { -#pragma unroll - for (int16_t j = 0; j < njoints; j++) { - psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 1); - psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 2); - psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 4); - psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 8); - // thread 0: psum_grad[j] will have the sum across 16 threads - // write out using only thread 0 - } - - if (elem_idx > 0) { - return; - } - -#pragma unroll - for (int16_t j = 0; j < njoints; j++) { + else { - grad_out_link_q[batch * njoints + j] = - psum_grad[j]; // write the sum to memory + cumul_mat[matAddrBase + elem_idx] = fixedTransform[elem_idx]; + + for (int l = 1; l < nlinks; l++) // TODO: add base link transform + { + float JM[M]; // store one row locally for mat-mul + int ftAddrStart = l * M * M; // + (t % M) * M; + int inAddrStart = matAddrBase + linkMap[l] * M * M; + int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; + + int j_type = jointMapType[l]; + + + if (j_type == FIXED) + { + fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); + } + else + { + float angle = q[batch * njoints + jointMap[l]]; + + update_axis_direction(angle, j_type); + + if (j_type <= Z_PRISM) + { + prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); + } + else if (j_type == X_ROT) + { + xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else if (j_type == Y_ROT) + { + yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else if 
(j_type == Z_ROT) + { + zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } + else + { + assert(j_type >= FIXED && j_type <= Z_ROT); + } + } + + // fetch one row of cumul_mat, multiply with a column, which is in JM + cumul_mat[outAddrStart + elem_idx] = + dot(*(float4 *)&cumul_mat[inAddrStart + ((elem_idx / 4) * M)], + *(float4 *)JM); + } } + + // thread-local partial sum accumulators + // We would like to keep these partial sums in register file and avoid memory + // accesses + psum_t psum_grad[MAX_JOINTS]; // MAX_JOINTS + // we are allocating a much larger array. So, we will be initializing just the + // portion we need. +#pragma unroll + + for (int i = 0; i < njoints; i++) + { + psum_grad[i] = 0.0; + } + + // read cumul mat index to run for this thread: + int read_cumul_idx = -1; + + const int spheres_perthread = (nspheres + 15) / 16; + + for (int i = 0; i < spheres_perthread; i++) + { + // const int sph_idx = elem_idx * spheres_perthread + i; + const int sph_idx = elem_idx + i * 16; + + if (sph_idx >= nspheres) + { + break; + } + const int sphAddrs = sph_idx * 4; + const int batchAddrs = batch * nspheres * 4; + float4 loc_grad_sphere_t = *(float4 *)&grad_spheres[batchAddrs + sphAddrs]; + + // Sparsity-based optimization: Skip zero computation + if (enable_sparsity_opt) + { + if ((loc_grad_sphere_t.x == 0) && (loc_grad_sphere_t.y == 0) && + (loc_grad_sphere_t.z == 0)) + { + continue; + } + } + float3 loc_grad_sphere = make_float3( + loc_grad_sphere_t.x, loc_grad_sphere_t.y, loc_grad_sphere_t.z); + + // read cumul idx: + read_cumul_idx = linkSphereMap[sph_idx]; + float spheres_mem[4]; + transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], + &robotSpheres[sphAddrs], &spheres_mem[0]); + + // assuming this sphere only depends on links lower than this index + // This could be relaxed by making read_cumul_idx = number of links. + // const int16_t loop_max = read_cumul_idx; + const int16_t loop_max = nlinks - 1; + + for (int j = loop_max; j > -1; j--) + { + if (linkChainMap[read_cumul_idx * nlinks + j] == 0.0) + { + continue; + } + int8_t axis_sign = 1; + + int j_type = jointMapType[j]; + + if (j_type != FIXED) + { + update_joint_type_direction(j_type, axis_sign); + } + + if (j_type == Z_ROT) + { + float result = 0.0; + z_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result, axis_sign); + psum_grad[jointMap[j]] += (psum_t)result; + } + else if ((j_type >= X_PRISM) && (j_type <= Z_PRISM)) + { + float result = 0.0; + xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], + loc_grad_sphere, result, j_type, axis_sign); + psum_grad[jointMap[j]] += (psum_t)result; + } + else if (j_type == X_ROT) + { + float result = 0.0; + x_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result, axis_sign); + psum_grad[jointMap[j]] += (psum_t)result; + } + else if (j_type == Y_ROT) + { + float result = 0.0; + y_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result, axis_sign); + psum_grad[jointMap[j]] += (psum_t)result; + } + } + } + + + // Instead of accumulating the sphere_grad and link_grad separately, we will + // accumulate them together once below. 
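// --------------------------------------------------------------------------
// [Editor's aside] Illustrative sketch (standalone, not part of this patch):
// the joint accumulation below relies on an XOR-shuffle butterfly, which
// sums a value across 16 lanes without shared memory; after the four
// exchanges every participating lane holds the same total.
__device__ float demo_butterfly_sum16(float v, const unsigned mask)
{
  v += __shfl_xor_sync(mask, v, 1);
  v += __shfl_xor_sync(mask, v, 2);
  v += __shfl_xor_sync(mask, v, 4);
  v += __shfl_xor_sync(mask, v, 8);
  return v; // all 16 lanes now hold the 16-lane sum
}
// --------------------------------------------------------------------------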
+ // + // // accumulate across 4 threads using shuffle operation + // for(int j=0; j= 0; k--) + for (int16_t k = 0; k < joints_per_thread; k++) + { + // int16_t j = elem_idx * joints_per_thread + k; + int16_t j = elem_idx + k * 16; + + // int16_t j = k * M + elem_idx; + if ((j > max_lmap) || (j < 0)) + continue; + + // This can be spread across threads as they are not sequential? + if (linkChainMap[l_map * nlinks + j] == 0.0) + { + continue; + } + int16_t j_idx = jointMap[j]; + int j_type = jointMapType[j]; + + int8_t axis_sign = 1; + + if (j_type != FIXED) + { + update_joint_type_direction(j_type, axis_sign); + } + + // get rotation vector: + if (j_type == Z_ROT) + { + z_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx], axis_sign); + } + else if (j_type >= X_PRISM & j_type <= Z_PRISM) + { + xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], + g_position, psum_grad[j_idx], j_type, axis_sign); + } + else if (j_type == X_ROT) + { + x_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx], axis_sign); + } + else if (j_type == Y_ROT) + { + y_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx], axis_sign); + } + } + } + + if (PARALLEL_WRITE) + { + // accumulate the partial sums across the 16 threads +#pragma unroll + + for (int16_t j = 0; j < njoints; j++) + { + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 1); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 2); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 4); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 8); + + // thread 0: psum_grad[j] will have the sum across 16 threads + // write out using only thread 0 + } + + const int16_t joints_per_thread = (njoints + 15) / 16; + +#pragma unroll + + for (int16_t j = 0; j < joints_per_thread; j++) + { + // const int16_t j_idx = elem_idx * joints_per_thread + j; + const int16_t j_idx = elem_idx + j * 16; + + if (j_idx >= njoints) + { + break; + } + grad_out_link_q[batch * njoints + j_idx] = + psum_grad[j_idx]; // write the sum to memory + } + } + else + { +#pragma unroll + + for (int16_t j = 0; j < njoints; j++) + { + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 1); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 2); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 4); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 8); + + // thread 0: psum_grad[j] will have the sum across 16 threads + // write out using only thread 0 + } + + if (elem_idx > 0) + { + return; + } + +#pragma unroll + + for (int16_t j = 0; j < njoints; j++) + { + { + grad_out_link_q[batch * njoints + j] = + psum_grad[j]; // write the sum to memory + } + } + } + + // accumulate the partial sums across the 16 threads } - } - // accumulate the partial sums across the 16 threads -} -template -__global__ void mat_to_quat_kernel(scalar_t *out_quat, - const scalar_t *in_rot_mat, - const int batch_size) { - // Only works for float32 - const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (batch_idx >= batch_size) { - return; - } - float q[4] = {0.0}; // initialize array + template + __global__ void mat_to_quat_kernel(scalar_t *out_quat, + const scalar_t *in_rot_mat, + const int batch_size) + { + // Only works for float32 + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - float rot[9]; + if (batch_idx >= batch_size) + { + return; + } + float q[4] = { 0.0 }; // initialize array - // read 
rot - *(float3 *)&rot[0] = *(float3 *)&in_rot_mat[batch_idx * 9]; - *(float3 *)&rot[3] = *(float3 *)&in_rot_mat[batch_idx * 9 + 3]; - *(float3 *)&rot[6] = *(float3 *)&in_rot_mat[batch_idx * 9 + 6]; + float rot[9]; - rot_to_quat(&rot[0], &q[0]); + // read rot + *(float3 *)&rot[0] = *(float3 *)&in_rot_mat[batch_idx * 9]; + *(float3 *)&rot[3] = *(float3 *)&in_rot_mat[batch_idx * 9 + 3]; + *(float3 *)&rot[6] = *(float3 *)&in_rot_mat[batch_idx * 9 + 6]; - // write quaternion: - *(float4 *)&out_quat[batch_idx * 4] = *(float4 *)&q[0]; -} -} // namespace Kinematics -} // namespace Curobo + rot_to_quat(&rot[0], &q[0]); -std::vector kin_fused_forward( - torch::Tensor link_pos, torch::Tensor link_quat, - torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat, - const torch::Tensor joint_vec, const torch::Tensor fixed_transform, - const torch::Tensor robot_spheres, const torch::Tensor link_map, - const torch::Tensor joint_map, const torch::Tensor joint_map_type, - const torch::Tensor store_link_map, const torch::Tensor link_sphere_map, - const int batch_size, const int n_spheres, - const bool use_global_cumul = false) { + // write quaternion: + *(float4 *)&out_quat[batch_idx * 4] = *(float4 *)&q[0]; + } + } // namespace Kinematics +} // namespace Curobo + +std::vectorkin_fused_forward( + torch::Tensor link_pos, torch::Tensor link_quat, + torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat, + const torch::Tensor joint_vec, const torch::Tensor fixed_transform, + const torch::Tensor robot_spheres, const torch::Tensor link_map, + const torch::Tensor joint_map, const torch::Tensor joint_map_type, + const torch::Tensor store_link_map, const torch::Tensor link_sphere_map, + const int batch_size, const int n_spheres, + const bool use_global_cumul = false) +{ using namespace Curobo::Kinematics; - const int n_joints = joint_vec.size(0) / batch_size; - const int n_links = link_map.size(0); + const int n_joints = joint_vec.size(0) / batch_size; + const int n_links = link_map.size(0); const int store_n_links = link_pos.size(1); assert(joint_map.dtype() == torch::kInt16); assert(joint_map_type.dtype() == torch::kInt8); @@ -1043,63 +1212,70 @@ std::vector kin_fused_forward( int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links)); - if (batches_per_block == 0) { + if (batches_per_block == 0) + { batches_per_block = 1; } - if (batches_per_block > MAX_BATCH_PER_BLOCK) { + if (batches_per_block > MAX_BATCH_PER_BLOCK) + { batches_per_block = MAX_BATCH_PER_BLOCK; } - if (batches_per_block * M > 1024) { + if (batches_per_block * M > 1024) + { batches_per_block = 1024 / (M); } + // batches_per_block = 1; const int threadsPerBlock = batches_per_block * M; - const int blocksPerGrid = - (batch_size * M + threadsPerBlock - 1) / threadsPerBlock; + const int blocksPerGrid = + (batch_size * M + threadsPerBlock - 1) / threadsPerBlock; const int sharedMemSize = batches_per_block * n_links * M * M * sizeof(float); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (use_global_cumul) { + if (use_global_cumul) + { AT_DISPATCH_FLOATING_TYPES( - link_pos.scalar_type(), "kin_fused_forward", ([&] { - kin_fused_warp_kernel2 - <<>>( - link_pos.data_ptr(), link_quat.data_ptr(), - batch_robot_spheres.data_ptr(), - global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, 
store_n_links); - })); - } else { + link_pos.scalar_type(), "kin_fused_forward", ([&] { + kin_fused_warp_kernel2 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + link_pos.data_ptr(), link_quat.data_ptr(), + batch_robot_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + else + { AT_DISPATCH_FLOATING_TYPES( - link_pos.scalar_type(), "kin_fused_forward", ([&] { - kin_fused_warp_kernel2 - <<>>( - link_pos.data_ptr(), link_quat.data_ptr(), - batch_robot_spheres.data_ptr(), - global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, store_n_links); - })); + link_pos.scalar_type(), "kin_fused_forward", ([&] { + kin_fused_warp_kernel2 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + link_pos.data_ptr(), link_quat.data_ptr(), + batch_robot_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {link_pos, link_quat, batch_robot_spheres, global_cumul_mat}; + return { link_pos, link_quat, batch_robot_spheres, global_cumul_mat }; } ///////////////////////////////////////////// @@ -1108,48 +1284,55 @@ std::vector kin_fused_forward( // This version is 30-100% faster compared to // kin_fused_backward_4t. 
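// --------------------------------------------------------------------------
// [Editor's aside] Illustrative sketch (standalone and simplified, not part
// of this patch): the launchers in this file follow the usual PyTorch
// extension pattern, i.e. AT_DISPATCH_FLOATING_TYPES to select scalar_t, a
// triple-chevron launch on the current CUDA stream, then a launch check.
// demo_scale and demo_scale_kernel are hypothetical names.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>

template <typename scalar_t>
__global__ void demo_scale_kernel(scalar_t *x, const scalar_t a, const int n)
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < n)
    x[i] *= a;
}

void demo_scale(torch::Tensor x, const double a)
{
  const int n = (int)x.numel();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "demo_scale", ([&] {
    demo_scale_kernel<scalar_t>
    <<<(n + 255) / 256, 256, 0, stream>>>(
      x.data_ptr<scalar_t>(), (scalar_t)a, n);
  }));
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// --------------------------------------------------------------------------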
///////////////////////////////////////////// -std::vector kin_fused_backward_16t( - torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos, - const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres, - const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec, - const torch::Tensor fixed_transform, const torch::Tensor robot_spheres, - const torch::Tensor link_map, const torch::Tensor joint_map, - const torch::Tensor joint_map_type, const torch::Tensor store_link_map, - const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map, - const int batch_size, const int n_spheres, const bool sparsity_opt = true, - const bool use_global_cumul = false) { +std::vectorkin_fused_backward_16t( + torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos, + const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres, + const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec, + const torch::Tensor fixed_transform, const torch::Tensor robot_spheres, + const torch::Tensor link_map, const torch::Tensor joint_map, + const torch::Tensor joint_map_type, const torch::Tensor store_link_map, + const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map, + const int batch_size, const int n_spheres, const bool sparsity_opt = true, + const bool use_global_cumul = false) +{ using namespace Curobo::Kinematics; - const int n_joints = joint_vec.size(0) / batch_size; - const int n_links = link_map.size(0); + const int n_joints = joint_vec.size(0) / batch_size; + const int n_links = link_map.size(0); const int store_n_links = store_link_map.size(0); // assert(n_links < 128); assert(n_joints < 128); // for larger num. of joints, change kernel3's // MAX_JOINTS template value. assert(n_links < MAX_TOTAL_LINKS); + // We need 16 threads per batch // Find the maximum number of batches we can use per block: // int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links)); - if (batches_per_block == 0) { + + if (batches_per_block == 0) + { batches_per_block = 1; } // To optimize for better occupancy, we might limit to MAX_BATCH_PER_BLOCK - if (batches_per_block > MAX_BW_BATCH_PER_BLOCK) { + if (batches_per_block > MAX_BW_BATCH_PER_BLOCK) + { batches_per_block = MAX_BW_BATCH_PER_BLOCK; } // we cannot have more than 1024 threads: - if (batches_per_block * M * M > 1024) { + if (batches_per_block * M * M > 1024) + { batches_per_block = 1024 / (M * M); } const int threadsPerBlock = batches_per_block * M * M; const int blocksPerGrid = - (batch_size * M * M + threadsPerBlock - 1) / threadsPerBlock; + (batch_size * M * M + threadsPerBlock - 1) / threadsPerBlock; + // assert to make sure n_joints, n_links < 128 to avoid overflow // printf("threadsPerBlock: %d, blocksPerGRid: %d\n", threadsPerBlock, // blocksPerGrid); @@ -1158,103 +1341,111 @@ std::vector kin_fused_backward_16t( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); assert(sparsity_opt); - if (use_global_cumul) { - if (n_joints < 16) { - AT_DISPATCH_FLOATING_TYPES( - grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { - kin_fused_backward_kernel3 - <<>>( - grad_out.data_ptr(), - grad_nlinks_pos.data_ptr(), - grad_nlinks_quat.data_ptr(), - grad_spheres.data_ptr(), - global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), - link_chain_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, 
store_n_links); - })); - } else if (n_joints < 64) { + if (use_global_cumul) + { + if (n_joints < 16) + { AT_DISPATCH_FLOATING_TYPES( - grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { - kin_fused_backward_kernel3 - <<>>( - grad_out.data_ptr(), - grad_nlinks_pos.data_ptr(), - grad_nlinks_quat.data_ptr(), - grad_spheres.data_ptr(), - global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), - link_chain_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, store_n_links); - })); - } else { - AT_DISPATCH_FLOATING_TYPES( - grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { - kin_fused_backward_kernel3 - <<>>( - grad_out.data_ptr(), - grad_nlinks_pos.data_ptr(), - grad_nlinks_quat.data_ptr(), - grad_spheres.data_ptr(), - global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), - link_chain_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, store_n_links); - })); + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + else if (n_joints < 64) + { + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); } - // - } else { + // + } + else + { // AT_DISPATCH_FLOATING_TYPES( - grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { - kin_fused_backward_kernel3 - <<>>( - grad_out.data_ptr(), - grad_nlinks_pos.data_ptr(), - grad_nlinks_quat.data_ptr(), - grad_spheres.data_ptr(), - 
global_cumul_mat.data_ptr(), - joint_vec.data_ptr(), - fixed_transform.data_ptr(), - robot_spheres.data_ptr(), - joint_map_type.data_ptr(), - joint_map.data_ptr(), link_map.data_ptr(), - store_link_map.data_ptr(), - link_sphere_map.data_ptr(), - link_chain_map.data_ptr(), batch_size, n_spheres, - n_links, n_joints, store_n_links); - })); + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > ( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {grad_out}; + return { grad_out }; } std::vector -matrix_to_quaternion(torch::Tensor out_quat, +matrix_to_quaternion(torch::Tensor out_quat, const torch::Tensor in_rot // batch_size, 3 -) { - + ) +{ using namespace Curobo::Kinematics; // we compute the warp threads based on number of boxes: @@ -1263,7 +1454,9 @@ matrix_to_quaternion(torch::Tensor out_quat, const int batch_size = in_rot.size(0); int threadsPerBlock = batch_size; - if (batch_size > 512) { + + if (batch_size > 512) + { threadsPerBlock = 512; } @@ -1274,12 +1467,12 @@ matrix_to_quaternion(torch::Tensor out_quat, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - in_rot.scalar_type(), "matrix_to_quaternion", ([&] { - mat_to_quat_kernel - <<>>( - out_quat.data_ptr(), in_rot.data_ptr(), - batch_size); - })); + in_rot.scalar_type(), "matrix_to_quaternion", ([&] { + mat_to_quat_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_quat.data_ptr(), in_rot.data_ptr(), + batch_size); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_quat}; -} \ No newline at end of file + return { out_quat }; +} diff --git a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp index c158d99..b9bb8ea 100644 --- a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp +++ b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp @@ -15,45 +15,68 @@ #include // CUDA forward declarations -std::vector reduce_cuda(torch::Tensor vec, torch::Tensor vec2, - torch::Tensor rho_buffer, - torch::Tensor sum, const int batch_size, - const int v_dim, const int m); +std::vectorreduce_cuda(torch::Tensor vec, + torch::Tensor vec2, + torch::Tensor rho_buffer, + torch::Tensor sum, + const int batch_size, + const int v_dim, + const int m); std::vector -lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer, - torch::Tensor y_buffer, torch::Tensor s_buffer, - torch::Tensor grad_q, const float epsilon, const int batch_size, - const int m, const int v_dim); +lbfgs_step_cuda(torch::Tensor step_vec, + torch::Tensor rho_buffer, + torch::Tensor y_buffer, + torch::Tensor s_buffer, + torch::Tensor grad_q, + const float epsilon, + const int batch_size, + const int m, + const int v_dim); std::vector -lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer, - torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, - torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, - const int m, const int v_dim); +lbfgs_update_cuda(torch::Tensor rho_buffer, + torch::Tensor y_buffer, + torch::Tensor s_buffer, + 
torch::Tensor q, + torch::Tensor grad_q, + torch::Tensor x_0, + torch::Tensor grad_0, + const int batch_size, + const int m, + const int v_dim); std::vector<torch::Tensor> -lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, - torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, - torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, - const float epsilon, const int batch_size, const int m, - const int v_dim, const bool stable_mode); +lbfgs_cuda_fuse(torch::Tensor step_vec, + torch::Tensor rho_buffer, + torch::Tensor y_buffer, + torch::Tensor s_buffer, + torch::Tensor q, + torch::Tensor grad_q, + torch::Tensor x_0, + torch::Tensor grad_0, + const float epsilon, + const int batch_size, + const int m, + const int v_dim, + const bool stable_mode); + // C++ interface // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), # x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) std::vector<torch::Tensor> lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor grad_q, const float epsilon, const int batch_size, - const int m, const int v_dim) { - + const int m, const int v_dim) +{ CHECK_INPUT(step_vec); CHECK_INPUT(rho_buffer); CHECK_INPUT(y_buffer); @@ -69,7 +92,8 @@ std::vector<torch::Tensor> lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, - const int m, const int v_dim) { + const int m, const int v_dim) +{ CHECK_INPUT(rho_buffer); CHECK_INPUT(y_buffer); CHECK_INPUT(s_buffer); @@ -86,7 +110,8 @@ lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer, std::vector<torch::Tensor> reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2, torch::Tensor rho_buffer, torch::Tensor sum, - const int batch_size, const int v_dim, const int m) { + const int batch_size, const int v_dim, const int m) +{ const at::cuda::OptionalCUDAGuard guard(sum.device()); return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, v_dim, m); @@ -97,7 +122,8 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, const float epsilon, const int batch_size, const int m, - const int v_dim, const bool stable_mode) { + const int v_dim, const bool stable_mode) +{ CHECK_INPUT(step_vec); CHECK_INPUT(rho_buffer); CHECK_INPUT(y_buffer); @@ -113,9 +139,10 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer, stable_mode); } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)"); - m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)"); - m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)"); - m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug"); +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)"); + m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)"); + m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)"); + m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug"); } diff --git 
a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu index 251ed9d..1bad714 100644 --- a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu +++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu @@ -39,696 +39,830 @@ #define FULL_MASK 0xffffffff -namespace Curobo { - -namespace Optimization { - -template -__device__ __forceinline__ void -scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out, - const int v_dim) { - - for (int i = 0; i < v_dim; i++) { - out[i] = a * b[i]; - } -} - -template -__device__ __forceinline__ void -m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out, - const int v_dim, const int m) { - for (int j = 0; j < m; j++) { - for (int i = 0; i < v_dim; i++) { - out[j * v_dim + i] = a[j] * b[j * v_dim + i]; +namespace Curobo +{ + namespace Optimization + { + template + __device__ __forceinline__ void + scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out, + const int v_dim) + { + for (int i = 0; i < v_dim; i++) + { + out[i] = a * b[i]; + } } - } -} -template -__device__ __forceinline__ void vec_vec_dot(const scalar_t *a, - const scalar_t *b, scalar_t &out, - const int v_dim) { - - for (int i = 0; i < v_dim; i++) { - out += a[i] * b[i]; - } -} - -template -__device__ __forceinline__ void update_r(const scalar_t *rho_y, - const scalar_t *s_buffer, scalar_t *r, - scalar_t &alpha, const int v_dim) { - // dot product: and subtract with alpha - for (int i = 0; i < v_dim; i++) { - alpha -= rho_y[i] * r[i]; - } - // scalar vector product: - for (int i = 0; i < v_dim; i++) { - r[i] = r[i] + alpha * s_buffer[i]; - } -} -template -__device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq, - const scalar_t alpha, - const int v_dim) { - // - for (int i = 0; i < v_dim; i++) { - gq[i] = gq[i] - (alpha * y_buffer[i]); - } -} -template -__global__ void -lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer, - const scalar_t *y_buffer, const scalar_t *s_buffer, - const scalar_t *grad_q, const float epsilon, - const int batchSize, const int m, const int v_dim) { - // each thread writes one sphere of some link - const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch - const int b_idx = t; - if (t >= (batchSize)) { - return; - } - - // get thread start address: - const int b_start_scalar_adrs = b_idx * m; - const int b_start_vec_adrs = b_idx * m * v_dim; - const int b_step_start_adrs = b_idx * v_dim; - - scalar_t rho_s[M * VDIM]; - - // copy floats to local buffer? 
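// --------------------------------------------------------------------------
// [Editor's aside] For reference, a scalar sketch of the textbook L-BFGS
// two-loop recursion that the kernels in this file parallelize (standalone,
// hypothetical names; history slots are stored oldest-first, matching the
// s_buffer/y_buffer layout used here).
__device__ void demo_lbfgs_two_loop(float *r,          // in: gradient; out: direction before negation
                                    const float *s,    // m x v_dim history of steps
                                    const float *y,    // m x v_dim history of gradient differences
                                    const float *rho,  // m scalars, rho_i = 1 / (y_i . s_i)
                                    float *alpha,      // m scratch values
                                    const float gamma, // initial Hessian scale, H0 = gamma * I
                                    const int m, const int v_dim)
{
  for (int i = m - 1; i >= 0; i--) // first loop: newest to oldest
  {
    float d = 0.f;
    for (int k = 0; k < v_dim; k++) d += s[i * v_dim + k] * r[k];
    alpha[i] = rho[i] * d;
    for (int k = 0; k < v_dim; k++) r[k] -= alpha[i] * y[i * v_dim + k];
  }

  for (int k = 0; k < v_dim; k++) r[k] *= gamma; // apply H0

  for (int i = 0; i < m; i++) // second loop: oldest to newest
  {
    float d = 0.f;
    for (int k = 0; k < v_dim; k++) d += y[i * v_dim + k] * r[k];
    const float beta = rho[i] * d;
    for (int k = 0; k < v_dim; k++) r[k] += (alpha[i] - beta) * s[i * v_dim + k];
  }
  // the step direction is then -r, as in the step_vec writes in these kernels
}
// --------------------------------------------------------------------------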
- // y_buffer, s_buffer, rho_buffer - // compute rho_s - scalar_t loc_ybuf[M * VDIM]; - scalar_t loc_sbuf[M * VDIM]; - scalar_t loc_rho[M]; - scalar_t gq[VDIM]; //, l_q[VDIM]; - scalar_t alpha_buffer[M]; - scalar_t t_1, t_2; - - for (int i = 0; i < m * v_dim; i++) { - loc_ybuf[i] = y_buffer[b_start_vec_adrs + i]; - loc_sbuf[i] = s_buffer[b_start_vec_adrs + i]; - } - for (int i = 0; i < v_dim; i++) { - gq[i] = grad_q[b_step_start_adrs + i]; - } - for (int i = 0; i < m; i++) { - loc_rho[i] = rho_buffer[b_start_scalar_adrs + i]; - } - - m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m); - // for loop over m - for (int i = m - 1; i > m - 2; i--) { - // l_start_vec_adrs = i * v_dim; - // scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim], - // v_dim); - vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim); - update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim); - } - // compute initial hessian: - vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1, - v_dim); - vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2, - v_dim); - t_1 = t_1 / t_2; - if (t_1 < 0) { - t_1 = 0; - } - - t_1 += epsilon; - scalar_vec_product(t_1, &gq[0], &gq[0], v_dim); - - m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m); - - for (int i = 0; i < m; i++) { - // scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim], - // v_dim); - update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i], - v_dim); - } - - // write gq to out grad: - - for (int i = 0; i < v_dim; i++) { - step_vec[b_step_start_adrs + i] = -1.0 * gq[i]; - } -} - -template -__forceinline__ __device__ psum_t warpReduce(psum_t v, int elems, - unsigned mask) { - psum_t val = v; - int shift = 1; - for (int i = elems; i > 1; i /= 2) { - val += __shfl_down_sync(mask, val, shift); - shift *= 2; - } - // val += __shfl_down_sync(mask, val, 1); // i=32 - // val += __shfl_down_sync(mask, val, 2); // i=16 - // val += __shfl_down_sync(mask, val, 4); // i=8 - // val += __shfl_down_sync(mask, val, 8); // i=4 - // val += __shfl_down_sync(mask, val, 16); // i=2 - return val; -} - -template -__forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data, - scalar_t *result) { - psum_t val = v; - unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m); - val += __shfl_down_sync(mask, val, 1); - val += __shfl_down_sync(mask, val, 2); - val += __shfl_down_sync(mask, val, 4); - val += __shfl_down_sync(mask, val, 8); - val += __shfl_down_sync(mask, val, 16); - // int leader = __ffs(mask) – 1; // select a leader lane - int leader = 0; - if (threadIdx.x % 32 == leader) { - if (m < 32) { - result[0] = (scalar_t)val; - } else { - data[(threadIdx.x + 1) / 32] = val; + template + __device__ __forceinline__ void + m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out, + const int v_dim, const int m) + { + for (int j = 0; j < m; j++) + { + for (int i = 0; i < v_dim; i++) + { + out[j * v_dim + i] = a[j] * b[j * v_dim + i]; + } + } } - } - if (m >= 32) { - __syncthreads(); + template + __device__ __forceinline__ void vec_vec_dot(const scalar_t *a, + const scalar_t *b, scalar_t& out, + const int v_dim) + { + for (int i = 0; i < v_dim; i++) + { + out += a[i] * b[i]; + } + } - int elems = (m + 31) / 32; - unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); - if (threadIdx.x / 32 == 0) { // only the first warp will do this work - psum_t val2 = data[threadIdx.x % 32]; - int shift = 1; - for (int i = elems - 1; i > 
0; i /= 2) { - val2 += __shfl_down_sync(mask2, val2, shift); + template + __device__ __forceinline__ void update_r(const scalar_t *rho_y, + const scalar_t *s_buffer, scalar_t *r, + scalar_t& alpha, const int v_dim) + { + // dot product: and subtract with alpha + for (int i = 0; i < v_dim; i++) + { + alpha -= rho_y[i] * r[i]; + } + + // scalar vector product: + for (int i = 0; i < v_dim; i++) + { + r[i] = r[i] + alpha * s_buffer[i]; + } + } + + template + __device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq, + const scalar_t alpha, + const int v_dim) + { + // + for (int i = 0; i < v_dim; i++) + { + gq[i] = gq[i] - (alpha * y_buffer[i]); + } + } + + template + __global__ void + lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer, + const scalar_t *y_buffer, const scalar_t *s_buffer, + const scalar_t *grad_q, const float epsilon, + const int batchSize, const int m, const int v_dim) + { + // each thread writes one sphere of some link + const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch + const int b_idx = t; + + if (t >= (batchSize)) + { + return; + } + + // get thread start address: + const int b_start_scalar_adrs = b_idx * m; + const int b_start_vec_adrs = b_idx * m * v_dim; + const int b_step_start_adrs = b_idx * v_dim; + + scalar_t rho_s[M * VDIM]; + + // copy floats to local buffer? + // y_buffer, s_buffer, rho_buffer + // compute rho_s + scalar_t loc_ybuf[M * VDIM]; + scalar_t loc_sbuf[M * VDIM]; + scalar_t loc_rho[M]; + scalar_t gq[VDIM]; // , l_q[VDIM]; + scalar_t alpha_buffer[M]; + scalar_t t_1, t_2; + + for (int i = 0; i < m * v_dim; i++) + { + loc_ybuf[i] = y_buffer[b_start_vec_adrs + i]; + loc_sbuf[i] = s_buffer[b_start_vec_adrs + i]; + } + + for (int i = 0; i < v_dim; i++) + { + gq[i] = grad_q[b_step_start_adrs + i]; + } + + for (int i = 0; i < m; i++) + { + loc_rho[i] = rho_buffer[b_start_scalar_adrs + i]; + } + + m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m); + + // for loop over m + for (int i = m - 1; i > m - 2; i--) + { + // l_start_vec_adrs = i * v_dim; + // scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim], + // v_dim); + vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim); + update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim); + } + + // compute initial hessian: + vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1, + v_dim); + vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2, + v_dim); + t_1 = t_1 / t_2; + + if (t_1 < 0) + { + t_1 = 0; + } + + t_1 += epsilon; + scalar_vec_product(t_1, &gq[0], &gq[0], v_dim); + + m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m); + + for (int i = 0; i < m; i++) + { + // scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim], + // v_dim); + update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i], + v_dim); + } + + // write gq to out grad: + + for (int i = 0; i < v_dim; i++) + { + step_vec[b_step_start_adrs + i] = -1.0 * gq[i]; + } + } + + template + __forceinline__ __device__ psum_t warpReduce(psum_t v, int elems, + unsigned mask) + { + psum_t val = v; + int shift = 1; + + for (int i = elems; i > 1; i /= 2) + { + val += __shfl_down_sync(mask, val, shift); shift *= 2; } - // int leader = __ffs(mask2) – 1; // select a leader lane + + // val += __shfl_down_sync(mask, val, 1); // i=32 + // val += __shfl_down_sync(mask, val, 2); // i=16 + // val += __shfl_down_sync(mask, val, 4); // i=8 + // val += __shfl_down_sync(mask, val, 8); // i=4 + // 
val += __shfl_down_sync(mask, val, 16); // i=2 + return val; + } + + template + __forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data, + scalar_t *result) + { + psum_t val = v; + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m); + + val += __shfl_down_sync(mask, val, 1); + val += __shfl_down_sync(mask, val, 2); + val += __shfl_down_sync(mask, val, 4); + val += __shfl_down_sync(mask, val, 8); + val += __shfl_down_sync(mask, val, 16); + + // int leader = __ffs(mask) – 1; // select a leader lane int leader = 0; - if (threadIdx.x % 32 == leader) { - result[0] = (scalar_t)val2; + + if (threadIdx.x % 32 == leader) + { + if (m < 32) + { + result[0] = (scalar_t)val; + } + else + { + data[(threadIdx.x + 1) / 32] = val; + } + } + + if (m >= 32) + { + __syncthreads(); + + int elems = (m + 31) / 32; + unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); + + if (threadIdx.x / 32 == 0) // only the first warp will do this work + { + psum_t val2 = data[threadIdx.x % 32]; + int shift = 1; + + for (int i = elems - 1; i > 0; i /= 2) + { + val2 += __shfl_down_sync(mask2, val2, shift); + shift *= 2; + } + + // int leader = __ffs(mask2) – 1; // select a leader lane + int leader = 0; + + if (threadIdx.x % 32 == leader) + { + result[0] = (scalar_t)val2; + } + } + } + __syncthreads(); + } + + // blockReduce + template + __forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data, + scalar_t *result) + { + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m); + psum_t val = warpReduce(v, 32, mask); + + // int leader = __ffs(mask) – 1; // select a leader lane + int leader = 0; + + if (threadIdx.x % 32 == leader) + { + data[(threadIdx.x + 1) / 32] = val; + } + + if (m >= 32) + { + __syncthreads(); + + int elems = (m + 31) / 32; + unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); + + if (threadIdx.x / 32 == 0) // only the first warp will do this work + { + psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2); + + // // int leader = __ffs(mask2) – 1; // select a leader lane + if (threadIdx.x == leader) + { + result[0] = (scalar_t)val2; + } + } + } + else + { + if (threadIdx.x == leader) + { + result[0] = (scalar_t)val; + } + } + __syncthreads(); + } + + template + __inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2, + const int m, psum_t *data, scalar_t *result) + { + scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x]; + + reduce(val, m, data, result); + } + + template__inline__ __device__ scalar_t relu(scalar_t var) + { + if (var < 0) + return 0; + else + return var; + } + + ////////////////////////////////////////////////////////// + // one block per batch + // v_dim threads per block + ////////////////////////////////////////////////////////// + template + __global__ void lbfgs_step_kernel(scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + const scalar_t *y_buffer, // m x b x 175 + const scalar_t *s_buffer, // m x b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, + const int m, const int v_dim) + { + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + __shared__ scalar_t gq[175]; /// gq = batch * v_dim + + assert(v_dim < 176); + int batch = blockIdx.x; // one block per batch + + if (threadIdx.x >= v_dim) + return; + + gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + + scalar_t alpha_buffer[16]; + + // 
assert(m<16); // allocating a buffer assuming m < 16 + + for (int i = m - 1; i > -1; i--) + { + dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, + &data[0], &result); + alpha_buffer[i] = result * rho_buffer[i * batchsize + batch]; + gq[threadIdx.x] = + gq[threadIdx.x] - + alpha_buffer[i] * + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + // compute var1 + scalar_t val1 = + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; + scalar_t val2 = + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; + reduce(val1 * val1, v_dim, data, &result); + scalar_t denominator = result; + reduce(val1 * val2, v_dim, data, &result); + scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + scalar_t gamma = relu(var1) + epsilon; // epsilon + + gq[threadIdx.x] = gamma * gq[threadIdx.x]; + + for (int i = 0; i < m; i++) + { + dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, + &data[0], &result); + gq[threadIdx.x] = + gq[threadIdx.x] + + (alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) * + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + step_vec[batch * v_dim + threadIdx.x] = + -1 * gq[threadIdx.x]; // copy from shared memory to global memory + } + + template + __global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const int batchsize, const int m, + const int v_dim) + { + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + + // __shared__ scalar_t y[175]; // temporary shared memory storage + // __shared__ scalar_t s[175]; // temporary shared memory storage + assert(v_dim <= VDIM); + int batch = blockIdx.x; // one block per batch + + if (threadIdx.x >= v_dim) + return; + + scalar_t y = + grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x]; + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce(y * s, v_dim, &data[0], &result); + + for (int i = 1; i < m; i++) + { + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + // __syncthreads(); + + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + grad_0[batch * v_dim + + threadIdx.x] = + grad_q[batch * v_dim + threadIdx.x]; + x_0[batch * v_dim + + threadIdx.x] = + q[batch * v_dim + threadIdx.x]; + + if (threadIdx.x == 0) + { + scalar_t rho = 1 / result; + + for (int i = 1; i < m; i++) + { + rho_buffer[(i - 1) * batchsize + batch] = + rho_buffer[i * batchsize + batch]; + } + rho_buffer[(m - 1) * batchsize + batch] = rho; } } - } - __syncthreads(); -} -// blockReduce -template -__forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data, - scalar_t *result) { - unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m); - psum_t val = warpReduce(v, 32, mask); + template + __global__ void reduce_kernel( + scalar_t *vec1, // b x 175 + scalar_t *vec2, // b x 175 + scalar_t *rho_buffer, // m x b x 
1 - // int leader = __ffs(mask) – 1; // select a leader lane - int leader = 0; - if (threadIdx.x % 32 == leader) { - data[(threadIdx.x + 1) / 32] = val; - } + scalar_t *sum_out, // m x b x 1 - if (m >= 32) { + const int batchsize, const int m, + const int v_dim) // s_buffer and y_buffer are not rolled by default + { + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + int batch = blockIdx.x; // one block per batch - __syncthreads(); + if (threadIdx.x >= v_dim) + return; - int elems = (m + 31) / 32; - unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); - if (threadIdx.x / 32 == 0) { // only the first warp will do this work - psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2); + //////////////////// + // update_buffer + //////////////////// + scalar_t y = vec1[batch * v_dim + threadIdx.x]; + scalar_t s = vec2[batch * v_dim + threadIdx.x]; - // // int leader = __ffs(mask2) – 1; // select a leader lane - if (threadIdx.x == leader) { - result[0] = (scalar_t)val2; + reduce(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; + + if (threadIdx.x == 0) + { + sum_out[batch] = 1 / numerator; + } + + // return; + if (threadIdx.x < m - 1) + { + // m thread participate to shif the values + // this is safe as m<32 and this happens in lockstep + rho_buffer[threadIdx.x * batchsize + batch] = + rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + } + else if (threadIdx.x == m - 1) + { + scalar_t rho = (1 / numerator); + rho_buffer[threadIdx.x * batchsize + batch] = rho; } } - } else { - if (threadIdx.x == leader) { - result[0] = (scalar_t)val; - } - } - __syncthreads(); -} -template -__inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2, - const int m, psum_t *data, scalar_t *result) { + template + __global__ void lbfgs_update_buffer_and_step_v1( + scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, const int m, const int v_dim, + const bool stable_mode = + false) // s_buffer and y_buffer are not rolled by default + { + // extern __shared__ scalar_t alpha_buffer_sh[]; + extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; + scalar_t *my_smem_rc = reinterpret_cast(my_smem); + scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x + scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m + scalar_t *s_buffer_sh = + &my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x + scalar_t *y_buffer_sh = + &my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x - scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x]; - reduce(val, m, data, result); -} + __shared__ psum_t + data[32]; // temporary buffer needed for + // block-wide reduction + __shared__ scalar_t + result; // result of the reduction or + // vector-vector dot product + int batch = blockIdx.x; // one block per batch -template __inline__ __device__ scalar_t relu(scalar_t var) { - if (var < 0) - return 0; - else - return var; -} + if (threadIdx.x >= v_dim) + return; -////////////////////////////////////////////////////////// -// one block per batch -// v_dim threads per block -////////////////////////////////////////////////////////// -template -__global__ void 
lbfgs_step_kernel(scalar_t *step_vec, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - const scalar_t *y_buffer, // m x b x 175 - const scalar_t *s_buffer, // m x b x 175 - const scalar_t *grad_q, // b x 175 - const float epsilon, const int batchsize, - const int m, const int v_dim) { - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - __shared__ scalar_t gq[175]; /// gq = batch * v_dim - assert(v_dim < 176); - int batch = blockIdx.x; // one block per batch - if (threadIdx.x >= v_dim) - return; + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq - gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; - scalar_t alpha_buffer[16]; - // assert(m<16); // allocating a buffer assuming m < 16 + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce_v1(y * s, v_dim, &data[0], &result); - for (int i = m - 1; i > -1; i--) { - dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, - &data[0], &result); - alpha_buffer[i] = result * rho_buffer[i * batchsize + batch]; - gq[threadIdx.x] = - gq[threadIdx.x] - - alpha_buffer[i] * - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } + // reduce(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; - // compute var1 - scalar_t val1 = - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; - scalar_t val2 = - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; - reduce(val1 * val1, v_dim, data, &result); - scalar_t denominator = result; - reduce(val1 * val2, v_dim, data, &result); - scalar_t numerator = result; - scalar_t var1 = numerator / denominator; + // scalar_t rho = 1.0/numerator; - scalar_t gamma = relu(var1) + epsilon; // epsilon + if (!rolled_ys) + { +#pragma unroll - gq[threadIdx.x] = gamma * gq[threadIdx.x]; - - for (int i = 0; i < m; i++) { - dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, - &data[0], &result); - gq[threadIdx.x] = - gq[threadIdx.x] + - (alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) * + for (int i = 1; i < m; i++) + { + scalar_t st = s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } + scalar_t yt = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt; + s_buffer_sh[m * threadIdx.x + i - 1] = st; + y_buffer_sh[m * threadIdx.x + i - 1] = yt; + } + } - step_vec[batch * v_dim + threadIdx.x] = - -1 * gq[threadIdx.x]; // copy from shared memory to global memory -} + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + s_buffer_sh[m * threadIdx.x + m - 1] = s; + y_buffer_sh[m * threadIdx.x + m - 1] = y; + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + + threadIdx.x] = + q[batch * v_dim + threadIdx.x]; -template -__global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1 - scalar_t *y_buffer, // m x b x 175 - scalar_t *s_buffer, // m x b x 175 - scalar_t *q, // b x 175 - scalar_t *x_0, // b x 175 - scalar_t *grad_0, // b x 175 - const scalar_t *grad_q, // b x 175 - const 
int batchsize, const int m, - const int v_dim) { - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - // __shared__ scalar_t y[175]; // temporary shared memory storage - // __shared__ scalar_t s[175]; // temporary shared memory storage - assert(v_dim <= VDIM); - int batch = blockIdx.x; // one block per batch - if (threadIdx.x >= v_dim) - return; + if (threadIdx.x < m - 1) + { + // m thread participate to shif the values + // this is safe as m<32 and this happens in lockstep + scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + } - scalar_t y = - grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x]; - scalar_t s = - q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce(y * s, v_dim, &data[0], &result); + if (threadIdx.x == m - 1) + { + scalar_t rho = 1.0 / numerator; - for (int i = 1; i < m; i++) { - s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } + // if this is nan, make it zero: + if (stable_mode && (numerator == 0.0)) + { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + } - // __syncthreads(); + // return; + __syncthreads(); - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; - grad_0[batch * v_dim + threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; - x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; - - if (threadIdx.x == 0) { - scalar_t rho = 1 / result; - for (int i = 1; i < m; i++) { - rho_buffer[(i - 1) * batchsize + batch] = - rho_buffer[i * batchsize + batch]; - } - rho_buffer[(m - 1) * batchsize + batch] = rho; - } -} - -template -__global__ void reduce_kernel( - scalar_t *vec1, // b x 175 - scalar_t *vec2, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - - scalar_t *sum_out, // m x b x 1 - - const int batchsize, const int m, - const int v_dim) // s_buffer and y_buffer are not rolled by default -{ - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - int batch = blockIdx.x; // one block per batch - if (threadIdx.x >= v_dim) - return; - - //////////////////// - // update_buffer - //////////////////// - scalar_t y = vec1[batch * v_dim + threadIdx.x]; - scalar_t s = vec2[batch * v_dim + threadIdx.x]; - - reduce(y * s, v_dim, &data[0], &result); - scalar_t numerator = result; - if (threadIdx.x == 0) { - sum_out[batch] = 1 / numerator; - } - // return; - if (threadIdx.x < m - 1) { - // m thread participate to shif the values - // this is safe as m<32 and this happens in lockstep - rho_buffer[threadIdx.x * batchsize + batch] = - rho_buffer[(threadIdx.x + 1) * batchsize + batch]; - } else if (threadIdx.x == m - 1) { - scalar_t rho = (1 / numerator); - rho_buffer[threadIdx.x * batchsize + batch] = rho; - } -} -template -__global__ void lbfgs_update_buffer_and_step_v1( - scalar_t *step_vec, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - scalar_t *y_buffer, // m x b x 
175 - scalar_t *s_buffer, // m x b x 175 - scalar_t *q, // b x 175 - scalar_t *x_0, // b x 175 - scalar_t *grad_0, // b x 175 - const scalar_t *grad_q, // b x 175 - const float epsilon, const int batchsize, const int m, const int v_dim, - const bool stable_mode = - false) // s_buffer and y_buffer are not rolled by default -{ - // extern __shared__ scalar_t alpha_buffer_sh[]; - extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; - scalar_t *my_smem_rc = reinterpret_cast(my_smem); - scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x - scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m - scalar_t *s_buffer_sh = - &my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x - scalar_t *y_buffer_sh = - &my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x - - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - int batch = blockIdx.x; // one block per batch - if (threadIdx.x >= v_dim) - return; - - scalar_t gq; - gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq - - //////////////////// - // update_buffer - //////////////////// - scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; - // if y is close to zero - scalar_t s = - q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce_v1(y * s, v_dim, &data[0], &result); - //reduce(y * s, v_dim, &data[0], &result); - scalar_t numerator = result; - // scalar_t rho = 1.0/numerator; - - if (!rolled_ys) { -#pragma unroll - for (int i = 1; i < m; i++) { - scalar_t st = - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - scalar_t yt = - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st; - y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt; - s_buffer_sh[m * threadIdx.x + i - 1] = st; - y_buffer_sh[m * threadIdx.x + i - 1] = yt; - } - } - - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; - s_buffer_sh[m * threadIdx.x + m - 1] = s; - y_buffer_sh[m * threadIdx.x + m - 1] = y; - grad_0[batch * v_dim + threadIdx.x] = gq; - x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; - if (threadIdx.x < m - 1) { - // m thread participate to shif the values - // this is safe as m<32 and this happens in lockstep - scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch]; - rho_buffer[threadIdx.x * batchsize + batch] = rho; - rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; - } - if (threadIdx.x == m - 1) { - scalar_t rho = 1.0 / numerator; - // if this is nan, make it zero: - if (stable_mode && numerator == 0.0) { - rho = 0.0; - } - rho_buffer[threadIdx.x * batchsize + batch] = rho; - rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; - } - // return; - __syncthreads(); - //////////////////// - // step - //////////////////// - // scalar_t alpha_buffer[16]; - // assert(m<16); // allocating a buffer assuming m < 16 + //////////////////// + // step + //////////////////// + // scalar_t alpha_buffer[16]; + // assert(m<16); // allocating a buffer assuming m < 16 #pragma unroll - for (int i = m - 1; i > -1; i--) { - // reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], - // v_dim, &data[0], &result); - reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); - 
alpha_buffer_sh[threadIdx.x * m + i] = - result * rho_buffer_sh[i * batchsize + batch]; - // gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim + - // batch*v_dim + threadIdx.x]; - gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * - y_buffer_sh[m * threadIdx.x + i]; - } - // compute var1 - reduce_v1(y * y, v_dim, data, &result); - scalar_t denominator = result; - // reduce(s*y, v_dim, data, &result); // redundant - already computed it above - // scalar_t numerator = result; - scalar_t var1 = numerator / denominator; + for (int i = m - 1; i > -1; i--) + { + // reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); + reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + alpha_buffer_sh[threadIdx.x * m + i] = + result * rho_buffer_sh[i * batchsize + batch]; - // To improve stability, uncomment below line: [this however leads to poor - // convergence] + // gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * + y_buffer_sh[m * threadIdx.x + i]; + } - if (stable_mode && denominator == 0.0) { - var1 = epsilon; - } + // compute var1 + reduce_v1(y * y, v_dim, data, &result); + scalar_t denominator = result; - scalar_t gamma = relu(var1); - gq = gamma * gq; + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + // To improve stability, uncomment below line: [this however leads to poor + // convergence] + + if (stable_mode && (denominator == 0.0)) + { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + gq = gamma * gq; #pragma unroll - for (int i = 0; i < m; i++) { - // reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], - // v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] - - // result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim + - // batch*v_dim + threadIdx.x]; - reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); - gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - - result * rho_buffer_sh[i * batchsize + batch]) * - s_buffer_sh[m * threadIdx.x + i]; - } - step_vec[batch * v_dim + threadIdx.x] = - -1.0 * gq; // copy from shared memory to global memory -} + for (int i = 0; i < m; i++) + { + // reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] - + // result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - + result * rho_buffer_sh[i * batchsize + batch]) * + s_buffer_sh[m * threadIdx.x + i]; + } -// (32/M) rolls per warp -// Threads in a warp in a GPU execute in a lock-step. We leverage that to do -// the roll without using temporary storage or explicit synchronization. -template -__global__ void lbfgs_roll(scalar_t *a, // m x b x 175 - scalar_t *b, // m x b x 175 - const int m_t, const int batchsize, const int m, - const int v_dim) { - assert(m_t <= 32); - int t = blockDim.x * blockIdx.x + threadIdx.x; - if (t >= m_t * v_dim * batchsize) - return; - int _m = t % m_t; - int _v_dim = (t / m_t) % v_dim; - int batch = t / (m * v_dim); // this line could be wrong? 
- - if (_m < m - 1) { - a[_m * batchsize * v_dim + batch * v_dim + _v_dim] = - a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; - b[_m * batchsize * v_dim + batch * v_dim + _v_dim] = - b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; - } -} - -template -__global__ void lbfgs_update_buffer_and_step( - scalar_t *step_vec, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - scalar_t *y_buffer, // m x b x 175 - scalar_t *s_buffer, // m x b x 175 - scalar_t *q, // b x 175 - scalar_t *x_0, // b x 175 - scalar_t *grad_0, // b x 175 - const scalar_t *grad_q, // b x 175 - const float epsilon, const int batchsize, const int m, const int v_dim, - const bool stable_mode = - false) // s_buffer and y_buffer are not rolled by default -{ - // extern __shared__ scalar_t alpha_buffer_sh[]; - extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; - scalar_t *alpha_buffer_sh = reinterpret_cast(my_smem); - - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - int batch = blockIdx.x; // one block per batch - if (threadIdx.x >= v_dim) - return; - - scalar_t gq; - gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq - - //////////////////// - // update_buffer - //////////////////// - scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; - // if y is close to zero - scalar_t s = - q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce(y * s, v_dim, &data[0], &result); - scalar_t numerator = result; - // scalar_t rho = 1.0/numerator; - - if (!rolled_ys) { - for (int i = 1; i < m; i++) { - s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory } - } - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; - grad_0[batch * v_dim + threadIdx.x] = gq; - x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; - if (threadIdx.x < m - 1) { - // m thread participate to shif the values - // this is safe as m<32 and this happens in lockstep - rho_buffer[threadIdx.x * batchsize + batch] = - rho_buffer[(threadIdx.x + 1) * batchsize + batch]; - } - if (threadIdx.x == m - 1) { - scalar_t rho = 1.0 / numerator; - // if this is nan, make it zero: - if (stable_mode && numerator == 0.0) { - rho = 0.0; + // (32/M) rolls per warp + // Threads in a warp in a GPU execute in a lock-step. We leverage that to do + // the roll without using temporary storage or explicit synchronization. + template + __global__ void lbfgs_roll(scalar_t *a, // m x b x 175 + scalar_t *b, // m x b x 175 + const int m_t, const int batchsize, const int m, + const int v_dim) + { + assert(m_t <= 32); + int t = blockDim.x * blockIdx.x + threadIdx.x; + + if (t >= m_t * v_dim * batchsize) + return; + + int _m = t % m_t; + int _v_dim = (t / m_t) % v_dim; + int batch = t / (m * v_dim); // this line could be wrong? 
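+ // Review sketch (assumption, not a verified fix): `_m` and `_v_dim` are
+ // decoded from `t` using `m_t`, so for a consistent decode the batch index
+ // should presumably also use `m_t`, i.e.
+ //   int batch = t / (m_t * v_dim);
+ // Dividing by `m` only agrees with this when `m_t == m`, which is likely
+ // what the "could be wrong" note above is flagging.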
+ + if (_m < m - 1) + { + a[_m * batchsize * v_dim + batch * v_dim + _v_dim] = + a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; + b[_m * batchsize * v_dim + batch * v_dim + _v_dim] = + b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; + } } - rho_buffer[threadIdx.x * batchsize + batch] = rho; - } - // return; - //__syncthreads(); - //////////////////// - // step - //////////////////// - // scalar_t alpha_buffer[16]; - // assert(m<16); // allocating a buffer assuming m < 16 + + template <typename scalar_t, typename psum_t, bool rolled_ys = false> + __global__ void lbfgs_update_buffer_and_step( + scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, const int m, const int v_dim, + const bool stable_mode = + false) // s_buffer and y_buffer are not rolled by default + { + // extern __shared__ scalar_t alpha_buffer_sh[]; + extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; + scalar_t *alpha_buffer_sh = reinterpret_cast<scalar_t *>(my_smem); + + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + int batch = blockIdx.x; // one block per batch + + if (threadIdx.x >= v_dim) + return; + + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; + + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; + + // scalar_t rho = 1.0/numerator; + + if (!rolled_ys) + { + for (int i = 1; i < m; i++) + { + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + } + + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + + threadIdx.x] = + q[batch * v_dim + threadIdx.x]; + + if (threadIdx.x < m - 1) + { + // m threads participate to shift the values + // this is safe as m<32 and this happens in lockstep + rho_buffer[threadIdx.x * batchsize + batch] = + rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + } + + if (threadIdx.x == m - 1) + { + scalar_t rho = 1.0 / numerator; + + // if this is nan, make it zero: + if (stable_mode && (numerator == 0.0)) + { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + } + + // return; + // __syncthreads(); + //////////////////// + // step + //////////////////// + // scalar_t alpha_buffer[16]; + // assert(m<16); // allocating a buffer assuming m < 16 #pragma unroll - for (int i = m - 1; i > -1; i--) { - reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], - v_dim, &data[0], &result); - alpha_buffer_sh[threadIdx.x * m + i] = - result * rho_buffer[i * batchsize + batch]; - gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } - // compute var1 - reduce(y * y, v_dim, data, &result); - scalar_t
denominator = result; - // reduce(s*y, v_dim, data, &result); // redundant - already computed it above - // scalar_t numerator = result; - scalar_t var1 = numerator / denominator; + for (int i = m - 1; i > -1; i--) + { + reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + alpha_buffer_sh[threadIdx.x * m + i] = + result * rho_buffer[i * batchsize + batch]; + gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } - // To improve stability, uncomment below line: [this however leads to poor - // convergence] + // compute var1 + reduce(y * y, v_dim, data, &result); + scalar_t denominator = result; - if (stable_mode && denominator == 0.0) { - var1 = epsilon; - } + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; - scalar_t gamma = relu(var1); + // To improve stability, uncomment below line: [this however leads to poor + // convergence] - gq = gamma * gq; + if (stable_mode && (denominator == 0.0)) + { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + + gq = gamma * gq; #pragma unroll - for (int i = 0; i < m; i++) { - reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], - v_dim, &data[0], &result); - gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - - result * rho_buffer[i * batchsize + batch]) * - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } - step_vec[batch * v_dim + threadIdx.x] = - -1.0 * gq; // copy from shared memory to global memory -} + for (int i = 0; i < m; i++) + { + reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - + result * rho_buffer[i * batchsize + batch]) * + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } -} // namespace Optimization + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory + } + } // namespace Optimization } // namespace Curobo std::vector lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor grad_q, const float epsilon, const int batch_size, - const int m, const int v_dim) { + const int m, const int v_dim) +{ using namespace Curobo::Optimization; const int threadsPerBlock = 128; - const int blocksPerGrid = - ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + const int blocksPerGrid = + ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; // launch threads per batch: // int threadsPerBlock = pow(2,((int)log2(v_dim))+1); @@ -739,26 +873,26 @@ lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - step_vec.scalar_type(), "lbfgs_step_cu", ([&] { - lbfgs_step_kernel_old - <<>>( - step_vec.data_ptr(), rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - grad_q.data_ptr(), epsilon, batch_size, m, v_dim); - })); + step_vec.scalar_type(), "lbfgs_step_cu", ([&] { + lbfgs_step_kernel_old + << < blocksPerGrid, threadsPerBlock, + v_dim * sizeof(step_vec.scalar_type()), stream >> > ( + step_vec.data_ptr(), rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + grad_q.data_ptr(), epsilon, batch_size, m, v_dim); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {step_vec}; + return { step_vec }; } std::vector 
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, - const int m, const int v_dim) { - + const int m, const int v_dim) +{ using namespace Curobo::Optimization; // const int threadsPerBlock = 128; @@ -767,23 +901,23 @@ lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer, int threadsPerBlock = v_dim; const int blocksPerGrid = - batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_update_cu", ([&] { - lbfgs_update_buffer_kernel - <<>>( - rho_buffer.data_ptr(), y_buffer.data_ptr(), - s_buffer.data_ptr(), q.data_ptr(), - x_0.data_ptr(), grad_0.data_ptr(), - grad_q.data_ptr(), batch_size, m, v_dim); - })); + y_buffer.scalar_type(), "lbfgs_update_cu", ([&] { + lbfgs_update_buffer_kernel + << < blocksPerGrid, threadsPerBlock, + v_dim * sizeof(y_buffer.scalar_type()), stream >> > ( + rho_buffer.data_ptr(), y_buffer.data_ptr(), + s_buffer.data_ptr(), q.data_ptr(), + x_0.data_ptr(), grad_0.data_ptr(), + grad_q.data_ptr(), batch_size, m, v_dim); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {rho_buffer, y_buffer, s_buffer, x_0, grad_0}; + return { rho_buffer, y_buffer, s_buffer, x_0, grad_0 }; } std::vector @@ -791,7 +925,8 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, const float epsilon, const int batch_size, const int m, - const int v_dim, const bool stable_mode) { + const int v_dim, const bool stable_mode) +{ using namespace Curobo::Optimization; // call first kernel: @@ -800,70 +935,73 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, assert(threadsPerBlock < 1024); assert(m < M_MAX); int blocksPerGrid = - batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - int smemsize = 0; - if (true) { - AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] { - smemsize = m * threadsPerBlock * sizeof(scalar_t); - lbfgs_update_buffer_and_step - <<>>( - step_vec.data_ptr(), - rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - q.data_ptr(), x_0.data_ptr(), - grad_0.data_ptr(), grad_q.data_ptr(), - epsilon, batch_size, m, v_dim, stable_mode); - }); - } else { + int smemsize = 0; + if (true) + { + AT_DISPATCH_FLOATING_TYPES( + y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] { + smemsize = m * threadsPerBlock * sizeof(scalar_t); + lbfgs_update_buffer_and_step + << < blocksPerGrid, threadsPerBlock, smemsize, stream >> > ( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, m, v_dim, stable_mode); + }); + } + else + { // v1 does not work AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] { - smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) + - m * batch_size * sizeof(scalar_t); - lbfgs_update_buffer_and_step_v1 - <<>>( - step_vec.data_ptr(), - rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - 
q.data_ptr(), x_0.data_ptr(), - grad_0.data_ptr(), grad_q.data_ptr(), - epsilon, batch_size, m, v_dim, stable_mode); - }); + y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] { + smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) + + m * batch_size * sizeof(scalar_t); + lbfgs_update_buffer_and_step_v1 + << < blocksPerGrid, threadsPerBlock, smemsize, stream >> > ( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, m, v_dim, stable_mode); + }); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0}; + return { step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0 }; } -std::vector reduce_cuda(torch::Tensor vec, torch::Tensor vec2, - torch::Tensor rho_buffer, - torch::Tensor sum, const int batch_size, - const int m, const int v_dim) { - +std::vectorreduce_cuda(torch::Tensor vec, torch::Tensor vec2, + torch::Tensor rho_buffer, + torch::Tensor sum, const int batch_size, + const int m, const int v_dim) +{ using namespace Curobo::Optimization; int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1); - int blocksPerGrid = - batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; - // printf("threadsPerBlock:%d, blocksPerGrid: %d\n", - // threadsPerBlock, blocksPerGrid); + int blocksPerGrid = + batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + // printf("threadsPerBlock:%d, blocksPerGrid: %d\n", + // threadsPerBlock, blocksPerGrid); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - vec.scalar_type(), "reduce_cu", ([&] { - reduce_kernel - <<>>( - vec.data_ptr(), vec2.data_ptr(), - rho_buffer.data_ptr(), sum.data_ptr(), - batch_size, m, v_dim); - })); + vec.scalar_type(), "reduce_cu", ([&] { + reduce_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + vec.data_ptr(), vec2.data_ptr(), + rho_buffer.data_ptr(), sum.data_ptr(), + batch_size, m, v_dim); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {sum, rho_buffer}; + return { sum, rho_buffer }; } diff --git a/src/curobo/curobolib/cpp/line_search_cuda.cpp b/src/curobo/curobolib/cpp/line_search_cuda.cpp index 1a88678..d038cd7 100644 --- a/src/curobo/curobolib/cpp/line_search_cuda.cpp +++ b/src/curobo/curobolib/cpp/line_search_cuda.cpp @@ -18,49 +18,67 @@ // CUDA forward declarations std::vector -update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q, - torch::Tensor best_iteration, - torch::Tensor current_iteration, +update_best_cuda(torch::Tensor best_cost, + torch::Tensor best_q, + torch::Tensor best_iteration, + torch::Tensor current_iteration, const torch::Tensor cost, - const torch::Tensor q, const int d_opt, const int cost_s1, - const int cost_s2, const int iteration, - const float delta_threshold, - const float relative_threshold = 0.999); + const torch::Tensor q, + const int d_opt, + const int cost_s1, + const int cost_s2, + const int iteration, + const float delta_threshold, + const float relative_threshold = 0.999); + +std::vectorline_search_cuda( + + // torch::Tensor m, + torch::Tensor best_x, + torch::Tensor best_c, + torch::Tensor best_grad, + const torch::Tensor g_x, + const torch::Tensor x_set, + const torch::Tensor step_vec, + const torch::Tensor c_0, + const torch::Tensor alpha_list, + const torch::Tensor c_idx, + const float c_1, + const float c_2, + const bool strong_wolfe, + const bool approx_wolfe, + const int l1, + const int l2, + const int batchsize); 
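+// For orientation, a scalar sketch of the acceptance test this line search
+// applies to each candidate step `alpha` (the names `phi_0`, `phi_a`, `dphi_0`,
+// `dphi_a` for the cost along `step_vec` and its directional derivative are
+// illustrative assumptions, not part of this API):
+//   bool armijo = phi_a <= phi_0 + c_1 * alpha * dphi_0;   // sufficient decrease
+//   bool curv   = strong_wolfe ? fabsf(dphi_a) <= c_2 * fabsf(dphi_0)
+//                              : dphi_a >= c_2 * dphi_0;   // curvature condition
+//   bool wolfe  = armijo && curv;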
-std::vector line_search_cuda( - // torch::Tensor m, - torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, - const torch::Tensor g_x, const torch::Tensor x_set, - const torch::Tensor step_vec, const torch::Tensor c_0, - const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, - const float c_2, const bool strong_wolfe, const bool approx_wolfe, - const int l1, const int l2, const int batchsize); // C++ interface // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), # x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -std::vector line_search_call( - // torch::Tensor m, - torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, - const torch::Tensor g_x, const torch::Tensor x_set, - const torch::Tensor step_vec, const torch::Tensor c_0, - const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, - const float c_2, const bool strong_wolfe, const bool approx_wolfe, - const int l1, const int l2, const int batchsize) { +std::vectorline_search_call( + // torch::Tensor m, + torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, + const torch::Tensor g_x, const torch::Tensor x_set, + const torch::Tensor step_vec, const torch::Tensor c_0, + const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, + const float c_2, const bool strong_wolfe, const bool approx_wolfe, + const int l1, const int l2, const int batchsize) +{ CHECK_INPUT(g_x); CHECK_INPUT(x_set); CHECK_INPUT(step_vec); CHECK_INPUT(c_0); CHECK_INPUT(alpha_list); CHECK_INPUT(c_idx); + // CHECK_INPUT(m); CHECK_INPUT(best_x); CHECK_INPUT(best_c); @@ -76,14 +94,14 @@ std::vector line_search_call( std::vector update_best_call(torch::Tensor best_cost, torch::Tensor best_q, - torch::Tensor best_iteration, + torch::Tensor best_iteration, torch::Tensor current_iteration, const torch::Tensor cost, const torch::Tensor q, const int d_opt, const int cost_s1, const int cost_s2, const int iteration, const float delta_threshold, - const float relative_threshold=0.999) { - + const float relative_threshold = 0.999) +{ CHECK_INPUT(best_cost); CHECK_INPUT(best_q); CHECK_INPUT(cost); @@ -96,8 +114,8 @@ update_best_call(torch::Tensor best_cost, torch::Tensor best_q, cost_s1, cost_s2, iteration, delta_threshold, relative_threshold); } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ m.def("update_best", &update_best_call, "Update Best (CUDA)"); m.def("line_search", &line_search_call, "Line search (CUDA)"); - -} \ No newline at end of file +} diff --git a/src/curobo/curobolib/cpp/line_search_kernel.cu b/src/curobo/curobolib/cpp/line_search_kernel.cu index 0c42029..ff061a1 100644 --- a/src/curobo/curobolib/cpp/line_search_kernel.cu +++ b/src/curobo/curobolib/cpp/line_search_kernel.cu @@ -37,362 +37,431 @@ #define FULL_MASK 0xffffffff -namespace Curobo { +namespace Curobo +{ + namespace Optimization + { + template + __inline__ __device__ void reduce(scalar_t v, int m, unsigned mask, + psum_t *data, scalar_t *result) + { + psum_t val = v; -namespace Optimization { + val += __shfl_down_sync(mask, val, 1); + val += __shfl_down_sync(mask, val, 2); 
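+ // The shuffle cascade here (offsets 1, 2, 4, 8, 16) is a standard warp-level
+ // tree reduction: after the offset-k step each lane holds the sum of 2*k
+ // consecutive lane values, so once the offset-16 step below completes, lane 0
+ // of a fully active warp holds the 32-lane partial sum that gets published.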
+ val += __shfl_down_sync(mask, val, 4); + val += __shfl_down_sync(mask, val, 8); + val += __shfl_down_sync(mask, val, 16); -template -__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask, - psum_t *data, scalar_t *result) { - psum_t val = v; - val += __shfl_down_sync(mask, val, 1); - val += __shfl_down_sync(mask, val, 2); - val += __shfl_down_sync(mask, val, 4); - val += __shfl_down_sync(mask, val, 8); - val += __shfl_down_sync(mask, val, 16); - // int leader = __ffs(mask) – 1; // select a leader lane - int leader = 0; - if (threadIdx.x % 32 == leader) { - if (m <= 32) { - result[0] = (scalar_t)val; - } else { - data[(threadIdx.x + 1) / 32] = val; - } - } - if (m > 32) { - - __syncthreads(); - - int elems = (m + 31) / 32; - assert(elems <= 32); - unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); - if (threadIdx.x < elems) { // only the first warp will do this work - psum_t val2 = data[threadIdx.x % 32]; - int shift = 1; - for (int i = elems - 1; i > 0; i /= 2) { - val2 += __shfl_down_sync(mask2, val2, shift); - shift *= 2; - } - // int leader = __ffs(mask2) – 1; // select a leader lane + // int leader = __ffs(mask) – 1; // select a leader lane int leader = 0; - if (threadIdx.x % 32 == leader) { - result[0] = (scalar_t)val2; + + if (threadIdx.x % 32 == leader) + { + if (m <= 32) + { + result[0] = (scalar_t)val; + } + else + { + data[(threadIdx.x + 1) / 32] = val; + } + } + + if (m > 32) + { + __syncthreads(); + + int elems = (m + 31) / 32; + assert(elems <= 32); + unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); + + if (threadIdx.x < elems) // only the first warp will do this work + { + psum_t val2 = data[threadIdx.x % 32]; + int shift = 1; + + for (int i = elems - 1; i > 0; i /= 2) + { + val2 += __shfl_down_sync(mask2, val2, shift); + shift *= 2; + } + + // int leader = __ffs(mask2) – 1; // select a leader lane + int leader = 0; + + if (threadIdx.x % 32 == leader) + { + result[0] = (scalar_t)val2; + } + } + } + __syncthreads(); + } + + // Launched with l2 threads/block and batchsize blocks + template + __global__ void line_search_kernel( + + // int64_t *m_idx, // 4x1x1 + scalar_t *best_x, // 4x280 + scalar_t *best_c, // 4x1 + scalar_t *best_grad, // 4x280 + const scalar_t *g_x, // 4x6x280 + const scalar_t *x_set, // 4x6x280 + const scalar_t *step_vec, // 4x280x1 + const scalar_t *c, // 4x6x1 + const scalar_t *alpha_list, // 4x6x1 + const int64_t *c_idx, // 4x1x1 + const float c_1, const float c_2, const bool strong_wolfe, + const bool approx_wolfe, + const int l1, // 6 + const int l2, // 280 + const int batchsize) // 4 + { + int batch = blockIdx.x; + __shared__ psum_t data[32]; + __shared__ scalar_t result[32]; + + assert(l1 <= 32); + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); + + if (threadIdx.x >= l2) + { + return; + } + + scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; + + // g_step = g0 @ step_vec_T + // g_x @ step_vec_T + for (int i = 0; i < l1; i++) + { + reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, + &data[0], &result[i]); + } + + __shared__ scalar_t step_success[32]; + __shared__ scalar_t step_success_w1[32]; + assert(blockDim.x >= l1); + bool wolfe_1 = false; + bool wolfe = false; + bool condition = threadIdx.x < l1; + + if (condition) + { + // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; + scalar_t alpha_list_elem = alpha_list[threadIdx.x]; + + // condition 1: + wolfe_1 = c[batch * l1 + threadIdx.x] <= + (c[batch * l1] + c_1 * alpha_list_elem * result[0]); + + // 
condition 2: + bool wolfe_2; + + if (strong_wolfe) + { + wolfe_2 = abs(result[threadIdx.x]) <= c_2 *abs(result[0]); + } + else + { + wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; + } + + wolfe = wolfe_1 & wolfe_2; + + step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); + step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); + } + + __syncthreads(); + + __shared__ int idx_shared; + + if (threadIdx.x == 0) + { + int m_id = 0; + int m1_id = 0; + scalar_t max1 = step_success[0]; + scalar_t max2 = step_success_w1[0]; + + for (int i = 1; i < l1; i++) + { + if (max1 < step_success[i]) + { + max1 = step_success[i]; + m_id = i; + } + + if (max2 < step_success_w1[i]) + { + max2 = step_success_w1[i]; + m1_id = i; + } + } + + if (!approx_wolfe) + { + // m_idx = torch.where(m_idx == 0, m1_idx, m_idx) + if (m_id == 0) + { + m_id = m1_id; + } + + // m_idx[m_idx == 0] = 1 + if (m_id == 0) + { + m_id = 1; + } + } + idx_shared = m_id + c_idx[batch]; + } + + //////////////////////////////////// + // write outputs using the computed index. + // one index per batch is computed + //////////////////////////////////// + // l2 is d_opt, l1 is line_search n. + // idx_shared contains index in l1 + // + __syncthreads(); + + if (threadIdx.x < l2) + { + if (threadIdx.x == 0) + { + // printf("block: %d, idx_shared: %d\n", batch, idx_shared); + } + best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x]; + best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x]; + } + + if (threadIdx.x == 0) + { + best_c[batch] = c[idx_shared]; } } - } - __syncthreads(); -} -// Launched with l2 threads/block and batchsize blocks -template -__global__ void line_search_kernel( - // int64_t *m_idx, // 4x1x1 - scalar_t *best_x, // 4x280 - scalar_t *best_c, // 4x1 - scalar_t *best_grad, // 4x280 - const scalar_t *g_x, // 4x6x280 - const scalar_t *x_set, // 4x6x280 - const scalar_t *step_vec, // 4x280x1 - const scalar_t *c, // 4x6x1 - const scalar_t *alpha_list, // 4x6x1 - const int64_t *c_idx, // 4x1x1 - const float c_1, const float c_2, const bool strong_wolfe, - const bool approx_wolfe, - const int l1, // 6 - const int l2, // 280 - const int batchsize) // 4 + // Launched with l2 threads/block and #blocks = batchsize + template + __global__ void line_search_kernel_mask( + + // int64_t *m_idx, // 4x1x1 + scalar_t *best_x, // 4x280 + scalar_t *best_c, // 4x1 + scalar_t *best_grad, // 4x280 + const scalar_t *g_x, // 4x6x280 + const scalar_t *x_set, // 4x6x280 + const scalar_t *step_vec, // 4x280x1 + const scalar_t *c, // 4x6x1 + const scalar_t *alpha_list, // 4x6x1 + const int64_t *c_idx, // 4x1x1 + const float c_1, const float c_2, const bool strong_wolfe, + const bool approx_wolfe, + const int l1, // 6 + const int l2, // 280 + const int batchsize) // 4 + { + int batch = blockIdx.x; + __shared__ psum_t data[32]; + __shared__ scalar_t result[32]; + + assert(l1 <= 32); + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); + + if (threadIdx.x >= l2) + { + return; + } + + scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; + + // g_step = g0 @ step_vec_T + // g_x @ step_vec_T + for (int i = 0; i < l1; i++) + { + reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, + &data[0], &result[i]); + } + + // __shared__ scalar_t step_success[32]; + // __shared__ scalar_t step_success_w1[32]; + assert(blockDim.x >= l1); + bool wolfe_1 = false; + bool wolfe = false; + bool condition = threadIdx.x < l1; + + if (condition) + { + scalar_t alpha_list_elem = 
alpha_list[threadIdx.x]; + + // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; + + // condition 1: + wolfe_1 = c[batch * l1 + threadIdx.x] <= + (c[batch * l1] + c_1 * alpha_list_elem * result[0]); + + // condition 2: + bool wolfe_2; + + if (strong_wolfe) + { + wolfe_2 = abs(result[threadIdx.x]) <= c_2 *abs(result[0]); + } + else + { + wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; + } + + // wolfe = torch.logical_and(wolfe_1, wolfe_2) + wolfe = wolfe_1 & wolfe_2; + + // // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1) + // // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1) + // step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); + // step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); + } + unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition); + unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition); + + // get the index of the last occurance of true + unsigned msk1_brev = __brev(msk1); + unsigned msk_brev = __brev(msk); + + int id1 = 32 - __ffs(msk1_brev); // position of least signficant bit set to 1 + int id = 32 - __ffs(msk_brev); // position of least signficant bit set to 1 + + __syncthreads(); + + __shared__ int idx_shared; + + if (threadIdx.x == 0) + { + if (!approx_wolfe) + { + if (id == 32) // msk is zero + { + id = id1; + } + + if (id == 0) // bit 0 is set + { + id = id1; + } + + if (id == 32) // msk is zero + { + id = 1; + } + + if (id == 0) + { + id = 1; + } + } + else + { + if (id == 32) // msk is zero + { + id = 0; + } + } + + // // _, m_idx = torch.max(step_success, dim=-2) + // // _, m1_idx = torch.max(step_success_w1, dim=-2) + // int m_id = 0; + // int m1_id = 0; + // scalar_t max1 = step_success[0]; + // scalar_t max2 = step_success_w1[0]; + // for (int i=1; iline_search_cuda( + + // torch::Tensor m_idx, + torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, + const torch::Tensor g_x, const torch::Tensor x_set, + const torch::Tensor step_vec, const torch::Tensor c_0, + const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, + const float c_2, const bool strong_wolfe, const bool approx_wolfe, + const int l1, const int l2, const int batchsize) { - - int batch = blockIdx.x; - __shared__ psum_t data[32]; - __shared__ scalar_t result[32]; - assert(l1 <= 32); - unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); - - if (threadIdx.x >= l2) { - return; - } - - scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; - - // g_step = g0 @ step_vec_T - // g_x @ step_vec_T - for (int i = 0; i < l1; i++) { - reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, - &data[0], &result[i]); - } - - __shared__ scalar_t step_success[32]; - __shared__ scalar_t step_success_w1[32]; - assert(blockDim.x >= l1); - bool wolfe_1 = false; - bool wolfe = false; - bool condition = threadIdx.x < l1; - if (condition) { - // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; - scalar_t alpha_list_elem = alpha_list[threadIdx.x]; - - // condition 1: - wolfe_1 = c[batch * l1 + threadIdx.x] <= - (c[batch * l1] + c_1 * alpha_list_elem * result[0]); - - // condition 2: - bool wolfe_2; - if (strong_wolfe) { - wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]); - } else { - wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; - } - - wolfe = wolfe_1 & wolfe_2; - - step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); - step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); - } - - __syncthreads(); - - __shared__ int idx_shared; - if 
(threadIdx.x == 0) { - int m_id = 0; - int m1_id = 0; - scalar_t max1 = step_success[0]; - scalar_t max2 = step_success_w1[0]; - for (int i = 1; i < l1; i++) { - if (max1 < step_success[i]) { - max1 = step_success[i]; - m_id = i; - } - if (max2 < step_success_w1[i]) { - max2 = step_success_w1[i]; - m1_id = i; - } - } - - if (!approx_wolfe) { - - // m_idx = torch.where(m_idx == 0, m1_idx, m_idx) - if (m_id == 0) { - m_id = m1_id; - } - // m_idx[m_idx == 0] = 1 - if (m_id == 0) { - m_id = 1; - } - } - idx_shared = m_id + c_idx[batch]; - } - - //////////////////////////////////// - // write outputs using the computed index. - // one index per batch is computed - //////////////////////////////////// - // l2 is d_opt, l1 is line_search n. - // idx_shared contains index in l1 - // - __syncthreads(); - if (threadIdx.x < l2) { - if (threadIdx.x == 0) { - // printf("block: %d, idx_shared: %d\n", batch, idx_shared); - } - best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x]; - best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x]; - } - if (threadIdx.x == 0) { - best_c[batch] = c[idx_shared]; - } -} - -// Launched with l2 threads/block and #blocks = batchsize -template -__global__ void line_search_kernel_mask( - // int64_t *m_idx, // 4x1x1 - scalar_t *best_x, // 4x280 - scalar_t *best_c, // 4x1 - scalar_t *best_grad, // 4x280 - const scalar_t *g_x, // 4x6x280 - const scalar_t *x_set, // 4x6x280 - const scalar_t *step_vec, // 4x280x1 - const scalar_t *c, // 4x6x1 - const scalar_t *alpha_list, // 4x6x1 - const int64_t *c_idx, // 4x1x1 - const float c_1, const float c_2, const bool strong_wolfe, - const bool approx_wolfe, - const int l1, // 6 - const int l2, // 280 - const int batchsize) // 4 -{ - - int batch = blockIdx.x; - __shared__ psum_t data[32]; - __shared__ scalar_t result[32]; - assert(l1 <= 32); - unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); - - if (threadIdx.x >= l2) { - return; - } - - scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; - - // g_step = g0 @ step_vec_T - // g_x @ step_vec_T - for (int i = 0; i < l1; i++) { - reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, - &data[0], &result[i]); - } - - // __shared__ scalar_t step_success[32]; - // __shared__ scalar_t step_success_w1[32]; - assert(blockDim.x >= l1); - bool wolfe_1 = false; - bool wolfe = false; - bool condition = threadIdx.x < l1; - if (condition) { - scalar_t alpha_list_elem = alpha_list[threadIdx.x]; - - // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; - - // condition 1: - wolfe_1 = c[batch * l1 + threadIdx.x] <= - (c[batch * l1] + c_1 * alpha_list_elem * result[0]); - - // condition 2: - bool wolfe_2; - if (strong_wolfe) { - wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]); - } else { - wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; - } - - // wolfe = torch.logical_and(wolfe_1, wolfe_2) - wolfe = wolfe_1 & wolfe_2; - - // // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1) - // // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1) - // step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); - // step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); - } - unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition); - unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition); - - // get the index of the last occurance of true - unsigned msk1_brev = __brev(msk1); - unsigned msk_brev = __brev(msk); - - int id1 = 32 - __ffs(msk1_brev); // position of least signficant bit set 
to 1 - int id = 32 - __ffs(msk_brev); // position of least signficant bit set to 1 - - __syncthreads(); - - __shared__ int idx_shared; - if (threadIdx.x == 0) { - if (!approx_wolfe) { - if (id == 32) { // msk is zero - id = id1; - } - if (id == 0) { // bit 0 is set - id = id1; - } - if (id == 32) { // msk is zero - id = 1; - } - if (id == 0) { - id = 1; - } - } else { - if (id == 32) { // msk is zero - id = 0; - } - } - - // // _, m_idx = torch.max(step_success, dim=-2) - // // _, m1_idx = torch.max(step_success_w1, dim=-2) - // int m_id = 0; - // int m1_id = 0; - // scalar_t max1 = step_success[0]; - // scalar_t max2 = step_success_w1[0]; - // for (int i=1; i line_search_cuda( - // torch::Tensor m_idx, - torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, - const torch::Tensor g_x, const torch::Tensor x_set, - const torch::Tensor step_vec, const torch::Tensor c_0, - const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, - const float c_2, const bool strong_wolfe, const bool approx_wolfe, - const int l1, const int l2, const int batchsize) { using namespace Curobo::Optimization; assert(l2 <= 1024); // multiple of 32 const int threadsPerBlock = 32 * ((l2 + 31) / 32); // l2; - const int blocksPerGrid = batchsize; + const int blocksPerGrid = batchsize; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - g_x.scalar_type(), "line_search_cu", ([&] { - line_search_kernel_mask - <<>>( - // m_idx.data_ptr(), - best_x.data_ptr(), best_c.data_ptr(), - best_grad.data_ptr(), g_x.data_ptr(), - x_set.data_ptr(), step_vec.data_ptr(), - c_0.data_ptr(), alpha_list.data_ptr(), - c_idx.data_ptr(), c_1, c_2, strong_wolfe, approx_wolfe, - l1, l2, batchsize); - })); + g_x.scalar_type(), "line_search_cu", ([&] { + line_search_kernel_mask + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + + // m_idx.data_ptr(), + best_x.data_ptr(), best_c.data_ptr(), + best_grad.data_ptr(), g_x.data_ptr(), + x_set.data_ptr(), step_vec.data_ptr(), + c_0.data_ptr(), alpha_list.data_ptr(), + c_idx.data_ptr(), c_1, c_2, strong_wolfe, approx_wolfe, + l1, l2, batchsize); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {best_x, best_c, best_grad}; -} \ No newline at end of file + return { best_x, best_c, best_grad }; +} diff --git a/src/curobo/curobolib/cpp/pose_distance_kernel.cu b/src/curobo/curobolib/cpp/pose_distance_kernel.cu index e50288c..5569aa1 100644 --- a/src/curobo/curobolib/cpp/pose_distance_kernel.cu +++ b/src/curobo/curobolib/cpp/pose_distance_kernel.cu @@ -23,616 +23,646 @@ #define GOALSET 2 #define BATCH_GOALSET 3 -namespace Curobo +namespace Curobo { -namespace Pose -{ -__device__ __forceinline__ void -transform_vec_quat(const float4 q, // x,y,z, qw, qx,qy,qz - const float *d_vec_weight, - float *C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - - if (false || q.x != 0 || q.y != 0 || q.z != 0) { - - C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] - - 2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] + - 2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] - - q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0]; - C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] + - 2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] - - q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] - - 2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1]; - C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] + - q.z * q.z * 
d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] - - q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] - - q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2]; - - C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] - - 2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] + - 2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] - - q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3]; - C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] + - 2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] - - q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] - - 2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4]; - C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] + - q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] - - q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] - - q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5]; - } + namespace Pose { - C[0] = d_vec_weight[0]; - C[1] = d_vec_weight[1]; - C[2] = d_vec_weight[2]; - C[3] = d_vec_weight[3]; - C[4] = d_vec_weight[4]; - C[5] = d_vec_weight[5]; - - - } -} - -__device__ __forceinline__ void -inv_transform_vec_quat(const float4 q_in, // x,y,z, qw, qx,qy,qz - const float *d_vec_weight, - float *C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - - if (q_in.x != 0 || q_in.y != 0 || q_in.z != 0) { - float4 q = make_float4(-q_in.x, -q_in.y, -q_in.z, q_in.w); - - C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] - - 2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] + - 2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] - - q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0]; - C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] + - 2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] - - q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] - - 2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1]; - C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] + - q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] - - q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] - - q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2]; - - C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] - - 2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] + - 2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] - - q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3]; - C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] + - 2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] - - q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] - - 2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4]; - C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] + - q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] - - q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] - - q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5]; - } - { - C[0] = d_vec_weight[0]; - C[1] = d_vec_weight[1]; - C[2] = d_vec_weight[2]; - C[3] = d_vec_weight[3]; - C[4] = d_vec_weight[4]; - C[5] = d_vec_weight[5]; - - - } -} -__device__ __forceinline__ void -compute_quat_distance(float *result, const float4 a, const float4 b) { - // We compute distance with the conjugate of b - float r_w = (a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z); - if (r_w < 0.0) { - r_w = 1.0; - } else { 
- r_w = -1.0; - } - - result[0] = r_w * (-1 * a.w * b.x + b.w * a.x - a.y * b.z + b.y * a.z); - result[1] = r_w * (-1 * a.w * b.y + b.w * a.y - a.z * b.x + b.z * a.x); - result[2] = r_w * (-1 * a.w * b.z + b.w * a.z - a.x * b.y + b.x * a.y); -} - -__device__ __forceinline__ void -compute_distance(float *distance_vec, float &distance, float &position_distance, - float &rotation_distance, const float3 current_position, - const float3 goal_position, const float4 current_quat, - const float4 goal_quat, const float *vec_weight, - const float *vec_convergence, const float position_weight, - const float rotation_weight) { - compute_quat_distance(&distance_vec[3], goal_quat, current_quat); - // compute position distance - *(float3 *)&distance_vec[0] = current_position - goal_position; - // distance_vec[0] = goal_position - position_distance = 0; - rotation_distance = 0; -// scale by vec weight and position weight: -#pragma unroll 3 - for (int i = 0; i < 3; i++) { - distance_vec[i] = vec_weight[i + 3] * distance_vec[i]; - position_distance += distance_vec[i] * distance_vec[i]; - } -#pragma unroll 3 - for (int i = 3; i < 6; i++) { - distance_vec[i] = vec_weight[i - 3] * distance_vec[i]; - rotation_distance += distance_vec[i] * distance_vec[i]; - } - - distance = 0; - if (rotation_distance > vec_convergence[0] * vec_convergence[0]) { - rotation_distance = sqrtf(rotation_distance); - //rotation_distance -= vec_convergence[0]; - - distance += rotation_weight * rotation_distance; - } - if (position_distance > vec_convergence[1] * vec_convergence[1]) { - position_distance = sqrtf(position_distance); - //position_distance -= vec_convergence[1]; - - distance += position_weight * position_distance; - } -} - -__device__ __forceinline__ void compute_metric_distance( - float *distance_vec, float &distance, float &position_distance, - float &rotation_distance, const float3 current_position, - const float3 goal_position, const float4 current_quat, - const float4 goal_quat, const float *vec_weight, - const float *vec_convergence, const float position_weight, - const float p_alpha, const float rotation_weight, const float r_alpha) { - compute_quat_distance(&distance_vec[3], goal_quat, current_quat); - // compute position distance - *(float3 *)&distance_vec[0] = current_position - goal_position; - // distance_vec[0] = goal_position - position_distance = 0; - rotation_distance = 0; -// scale by vec weight and position weight: -#pragma unroll 3 - for (int i = 0; i < 3; i++) { - distance_vec[i] = vec_weight[i + 3] * distance_vec[i]; - // position_distance += powf(distance_vec[i],2); - position_distance += distance_vec[i] * distance_vec[i]; - } -#pragma unroll 3 - for (int i = 3; i < 6; i++) { - distance_vec[i] = vec_weight[i - 3] * distance_vec[i]; - // rotation_distance += powf(distance_vec[i],2); - rotation_distance += distance_vec[i] * distance_vec[i]; - } - - distance = 0; - if (rotation_distance > vec_convergence[0] * vec_convergence[0]) { - rotation_distance = sqrtf(rotation_distance); - //rotation_distance -= vec_convergence[0]; - - distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance)); - } - if (position_distance > vec_convergence[1] * vec_convergence[1]) { - //position_distance -= vec_convergence[1]; - position_distance = sqrtf(position_distance); - distance += position_weight * log2f(coshf(p_alpha * position_distance)); - } -} - -template -__global__ void -backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3] - scalar_t *out_grad_q, // [b,4] - const scalar_t *grad_distance, // [b,1] 
- const scalar_t *grad_p_distance, // [b,1] - const scalar_t *grad_q_distance, // [b,1] - const scalar_t *pose_weight, // [2] - const scalar_t *grad_p_vec, // [b,3] - const scalar_t *grad_q_vec, // [b,4] - const int batch_size) { - const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (batch_idx >= batch_size) { - return; - } - // read data - const float g_distance = grad_distance[batch_idx]; - const float2 p_weight = *(float2 *)&pose_weight[0]; - const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; - const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; - const float g_p_distance = grad_p_distance[batch_idx]; - const float g_q_distance = grad_q_distance[batch_idx]; - - // compute position gradient - float3 g_p = - (g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3 - float3 g_q = - (g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3 - - // write out - *(float3 *)&out_grad_p[batch_idx * 3] = g_p; - *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; -} - -template <typename scalar_t> -__global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3] - scalar_t *out_grad_q, // [b,4] - const scalar_t *grad_distance, // [b,1] - const scalar_t *pose_weight, // [2] - const scalar_t *grad_p_vec, // [b,3] - const scalar_t *grad_q_vec, // [b,4] - const int batch_size) { - const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (batch_idx >= batch_size) { - return; - } - // read data - const float g_distance = grad_distance[batch_idx]; - const float2 p_weight = *(float2 *)&pose_weight[0]; - const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; - const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; - - // compute position gradient - float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3 - float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3 - - // write out - *(float3 *)&out_grad_p[batch_idx * 3] = g_p; - *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; -} - -template <typename scalar_t, bool write_distance> -__global__ void goalset_pose_distance_kernel( - scalar_t *out_distance, scalar_t *out_position_distance, - scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec, - int32_t *out_gidx, const scalar_t *current_position, - const scalar_t *goal_position, const scalar_t *current_quat, - const scalar_t *goal_quat, const scalar_t *vec_weight, - const scalar_t *weight, const scalar_t *vec_convergence, - const scalar_t *run_weight, const scalar_t *run_vec_weight, - const int32_t *batch_pose_idx, const int mode, const int num_goals, - const int batch_size, const int horizon, const bool write_grad = false) { - const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x); - const int batch_idx = t_idx / horizon; - const int h_idx = t_idx - (batch_idx * horizon); - if (batch_idx >= batch_size || h_idx >= horizon) { - return; - } - - // read current pose: - float3 position = - *(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3]; - float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4]; - float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x); - - // read weights: - - float position_weight = weight[1]; - float rotation_weight = weight[0]; - if (!write_distance) { - position_weight *= run_weight[h_idx]; - rotation_weight *= run_weight[h_idx]; - if (position_weight == 0.0 && rotation_weight == 0.0) { - return; + namespace Pose + { + __device__ __forceinline__ void + transform_error_quat(const float4 q, // x,y,z, qw, qx,qy,qz + const float3 error, + float *result) + { + // do dot product: + if ((q.x
!= 0) || (q.z != 0)) + { + result[0] = q.w * q.w * error.x + 2 * q.y * q.w * error.z - + 2 * q.z * q.w * error.y + q.x * q.x * error.x + + 2 * q.y * q.x * error.y + 2 * q.z * q.x * error.z - + q.z * q.z * error.x - q.y * q.y * error.x; + result[1] = 2 * q.x * q.y * error.x + q.y * q.y * error.y + + 2 * q.z * q.y * error.z + 2 * q.w * q.z * error.x - + q.z * q.z * error.y + q.w * q.w * error.y - + 2 * q.x * q.w * error.z - q.x * q.x * error.y; + result[2] = 2 * q.x * q.z * error.x + 2 * q.y * q.z * error.y + + q.z * q.z * error.z - 2 * q.w * q.y * error.x - + q.y * q.y * error.z + 2 * q.w * q.x * error.y - + q.x * q.x * error.z + q.w * q.w * error.z; + } + else + { + result[0] = error.x; + result[1] = error.y; + result[2] = error.z; + } } - } - float3 l_goal_position; - float4 l_goal_quat; - float distance_vec[6]; // = {0.0}; - float pose_distance = 0.0; - float position_distance = 0.0; - float rotation_distance = 0.0; - float best_distance = INFINITY; - float best_position_distance = 0.0; - float best_rotation_distance = 0.0; - float best_distance_vec[6] = {0.0}; - float best_des_vec_weight[6] = {0.0}; - float d_vec_convergence[2]; + __device__ __forceinline__ void transform_point(const float3 frame_pos, + const float4 frame_quat, + const float3& point, + float3 & transformed_point) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + const float4 q = frame_quat; + const float3 p = frame_pos; - *(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0]; + if ((q.x != 0) || (q.y != 0) || (q.z != 0)) + { + transformed_point.x = p.x + q.w * q.w * point.x + 2 * q.y * q.w * point.z - + 2 * q.z * q.w * point.y + q.x * q.x * point.x + + 2 * q.y * q.x * point.y + 2 * q.z * q.x * point.z - + q.z * q.z * point.x - q.y * q.y * point.x; + transformed_point.y = p.y + 2 * q.x * q.y * point.x + q.y * q.y * point.y + + 2 * q.z * q.y * point.z + 2 * q.w * q.z * point.x - + q.z * q.z * point.y + q.w * q.w * point.y - 2 * q.x * q.w * point.z - + q.x * q.x * point.y; + transformed_point.z = p.z + 2 * q.x * q.z * point.x + 2 * q.y * q.z * point.y + + q.z * q.z * point.z - 2 * q.w * q.y * point.x - q.y * q.y * point.z + + 2 * q.w * q.x * point.y - q.x * q.x * point.z + q.w * q.w * point.z; + } + else + { + transformed_point.x = p.x + point.x; + transformed_point.y = p.y + point.y; + transformed_point.z = p.z + point.z; + } + } - int best_idx = -1; - float d_vec_weight[6]; - float des_vec_weight[6] = {0.0}; - *(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0]; - *(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3]; - if (h_idx < horizon - 1) { - *(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0]; - *(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3]; - } - + __device__ __forceinline__ void inv_transform_point(const float3 frame_pos, + const float4 frame_quat, + const float3& point, + float3 & transformed_point) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + float4 q = make_float4(-1 * frame_quat.x, -1 * frame_quat.y, -1 * frame_quat.z, frame_quat.w); + float3 p = make_float3(0, 0, 0); - // read offset - int offset = batch_pose_idx[batch_idx]; - if (mode == BATCH_GOALSET || mode == BATCH_GOAL) { - offset = (offset)*num_goals; - } + transform_point(make_float3(0, 0, 0), q, frame_pos, p); + p = -1.0 * p; - for (int k = 0; k < num_goals; k++) { + if ((q.x != 0) || (q.y != 0) || (q.z != 0)) + { + transformed_point.x = p.x + q.w * q.w * point.x + 2 * q.y * q.w * point.z - + 2 * q.z * q.w * point.y + q.x * q.x * point.x + + 2 * q.y * q.x * point.y + 2 * q.z * 
q.x * point.z - + q.z * q.z * point.x - q.y * q.y * point.x; + transformed_point.y = p.y + 2 * q.x * q.y * point.x + q.y * q.y * point.y + + 2 * q.z * q.y * point.z + 2 * q.w * q.z * point.x - + q.z * q.z * point.y + q.w * q.w * point.y - 2 * q.x * q.w * point.z - + q.x * q.x * point.y; + transformed_point.z = p.z + 2 * q.x * q.z * point.x + 2 * q.y * q.z * point.y + + q.z * q.z * point.z - 2 * q.w * q.y * point.x - q.y * q.y * point.z + + 2 * q.w * q.x * point.y - q.x * q.x * point.z + q.w * q.w * point.z; + } + else + { + transformed_point.x = p.x + point.x; + transformed_point.y = p.y + point.y; + transformed_point.z = p.z + point.z; + } + } - l_goal_position = *(float3 *)&goal_position[(offset + k) * 3]; - float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4]; - l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x); - transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]); + __device__ __forceinline__ void inv_transform_quat( + const float4 frame_quat, + const float4& quat, + float4 & transformed_quat) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + float4 q = make_float4(-1 * frame_quat.x, -1 * frame_quat.y, -1 * frame_quat.z, frame_quat.w); - compute_distance(&distance_vec[0], pose_distance, position_distance, - rotation_distance, position, l_goal_position, quat, - l_goal_quat, - &des_vec_weight[0], //&l_vec_weight[0], - &d_vec_convergence[0], //&l_vec_convergence[0], - position_weight, rotation_weight); - if (pose_distance <= best_distance) { - best_idx = k; - best_distance = pose_distance; - best_position_distance = position_distance; - best_rotation_distance = rotation_distance; - if (write_grad) { - //inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]); -#pragma unroll 6 - for (int i = 0; i < 6; i++) { - best_distance_vec[i] = distance_vec[i]; - best_des_vec_weight[i] = des_vec_weight[i]; + if ((q.x != 0) || (q.y != 0) || (q.z != 0)) + { + // multiply quats together new_q = q * quat; + transformed_quat.w = q.w * quat.w - q.x * quat.x - q.y * quat.y - q.z * quat.z; + transformed_quat.x = q.w * quat.x + quat.w * q.x + q.y * quat.z - quat.y * q.z; + transformed_quat.y = q.w * quat.y + quat.w * q.y + q.z * quat.x - quat.z * q.x; + transformed_quat.z = q.w * quat.z + quat.w * q.z + q.x * quat.y - quat.x * q.y; + } + else + { + transformed_quat = quat; + } + } + + template + __device__ __forceinline__ void + compute_pose_distance_vector(float *result_vec, + const float3 goal_position, + const float4 goal_quat, + const float3 current_position, + const float4 current_quat, + const float *vec_weight, + const float3 offset_position, + const float3 offset_rotation, + const bool reach_offset) + { + // project current position to goal frame: + float3 error_position = make_float3(0, 0, 0); + + + float3 error_quat = make_float3(0, 0, 0); + + if (project_distance) + { + float4 projected_quat = make_float4(0, 0, 0, 0); + + inv_transform_point(goal_position, goal_quat, current_position, error_position); + + // project current quat to goal frame: + inv_transform_quat(goal_quat, current_quat, projected_quat); + + + float r_w = projected_quat.w; + + if (r_w < 0.0) + { + r_w = -1.0; + } + else + { + r_w = 1.0; + } + + error_quat.x = r_w * (projected_quat.x); + error_quat.y = r_w * (projected_quat.y); + error_quat.z = r_w * (projected_quat.z); + } + else + { + error_position = current_position - goal_position; + + + float r_w = + (goal_quat.w * current_quat.w + goal_quat.x * current_quat.x + goal_quat.y * + current_quat.y + goal_quat.z * 
current_quat.z); + + if (r_w < 0.0) + { + r_w = 1.0; + } + else + { + r_w = -1.0; + } + + error_quat.x = r_w * + (-1 * goal_quat.w * current_quat.x + current_quat.w * goal_quat.x - + goal_quat.y * + current_quat.z + current_quat.y * goal_quat.z); + error_quat.y = r_w * + (-1 * goal_quat.w * current_quat.y + current_quat.w * goal_quat.y - + goal_quat.z * + current_quat.x + current_quat.z * goal_quat.x); + error_quat.z = r_w * + (-1 * goal_quat.w * current_quat.z + current_quat.w * goal_quat.z - + goal_quat.x * + current_quat.y + current_quat.x * goal_quat.y); + } + + + if (reach_offset) + { + error_position = error_position + offset_position; + error_quat = error_quat + offset_rotation; + } + + + error_position = (*(float3 *)&vec_weight[3]) * error_position; + + error_quat = (*(float3 *)&vec_weight[0]) * error_quat; + + + // compute rotation distance: + + + if (project_distance) + { + // project this error back: + + transform_error_quat(goal_quat, error_quat, &result_vec[3]); + + // projected distance back to world frame: + transform_error_quat(goal_quat, error_position, &result_vec[0]); + } + else + { + *(float3 *)&result_vec[0] = error_position; + *(float3 *)&result_vec[3] = error_quat; + } + } + + template + __device__ __forceinline__ void + compute_pose_distance(float *distance_vec, float& distance, float& position_distance, + float& rotation_distance, const float3 current_position, + const float3 goal_position, const float4 current_quat, + const float4 goal_quat, const float *vec_weight, + const float *vec_convergence, const float position_weight, + const float rotation_weight, + const float p_alpha, + const float r_alpha, + const float3 offset_position, + const float3 offset_rotation, + const bool reach_offset) + { + compute_pose_distance_vector(&distance_vec[0], + goal_position, + goal_quat, + current_position, + current_quat, + &vec_weight[0], + offset_position, + offset_rotation, + reach_offset); + + position_distance = 0; + rotation_distance = 0; + + // scale by vec weight and position weight: +#pragma unroll 3 + + for (int i = 0; i < 3; i++) + { + position_distance += distance_vec[i] * distance_vec[i]; + } +#pragma unroll 3 + + for (int i = 3; i < 6; i++) + { + rotation_distance += distance_vec[i] * distance_vec[i]; + } + + distance = 0; + + if (rotation_distance > vec_convergence[0] * vec_convergence[0]) + { + rotation_distance = sqrtf(rotation_distance); + + if (use_metric) + { + distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance)); + } + else + { + distance += rotation_weight * rotation_distance; + } + } + + if (position_distance > vec_convergence[1] * vec_convergence[1]) + { + position_distance = sqrtf(position_distance); + + if (use_metric) + { + distance += position_weight * log2f(coshf(p_alpha * position_distance)); + } + else + { + distance += position_weight * position_distance; } } } - } - // write out: - // write out pose distance: - out_distance[batch_idx * horizon + h_idx] = best_distance; - if (write_distance) { - if(position_weight == 0.0) + template + __global__ void + backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3] + scalar_t *out_grad_q, // [b,4] + const scalar_t *grad_distance, // [b,1] + const scalar_t *grad_p_distance, // [b,1] + const scalar_t *grad_q_distance, // [b,1] + const scalar_t *pose_weight, // [2] + const scalar_t *grad_p_vec, // [b,3] + const scalar_t *grad_q_vec, // [b,4] + const int batch_size) { - best_position_distance = 0.0; + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; + + if (batch_idx >= 
batch_size) + { + return; + } + + // read data + const float g_distance = grad_distance[batch_idx]; + const float2 p_weight = *(float2 *)&pose_weight[0]; + const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; + const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; + const float g_p_distance = grad_p_distance[batch_idx]; + const float g_q_distance = grad_q_distance[batch_idx]; + + // compute position gradient + float3 g_p = + (g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3 + float3 g_q = + (g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3 + + // write out + *(float3 *)&out_grad_p[batch_idx * 3] = g_p; + *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; } - if (rotation_weight == 0.0) + + template + __global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3] + scalar_t *out_grad_q, // [b,4] + const scalar_t *grad_distance, // [b,1] + const scalar_t *pose_weight, // [2] + const scalar_t *grad_p_vec, // [b,3] + const scalar_t *grad_q_vec, // [b,4] + const int batch_size) { - best_rotation_distance = 0.0; - } - out_position_distance[batch_idx * horizon + h_idx] = best_position_distance; - out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance; - } - out_gidx[batch_idx * horizon + h_idx] = best_idx; + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (write_grad) { - if (write_distance) { - position_weight = 1; - rotation_weight = 1; - } - if (best_position_distance > 0) { - best_position_distance = (position_weight / best_position_distance); + if (batch_idx >= batch_size) + { + return; + } - out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = - best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = - best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = - best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance; + // read data + const float g_distance = grad_distance[batch_idx]; + const float2 p_weight = *(float2 *)&pose_weight[0]; + const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; + const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; - } else { - out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0; + // compute position gradient + float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3 + float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3 + + // write out + *(float3 *)&out_grad_p[batch_idx * 3] = g_p; + *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; } - if (best_rotation_distance > 0) { - best_rotation_distance = rotation_weight / best_rotation_distance; + template + __global__ void goalset_pose_distance_kernel( + scalar_t *out_distance, scalar_t *out_position_distance, + scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec, + int32_t *out_gidx, const scalar_t *current_position, + const scalar_t *goal_position, const scalar_t *current_quat, + const scalar_t *goal_quat, const scalar_t *vec_weight, + const scalar_t *weight, const scalar_t *vec_convergence, + const scalar_t *run_weight, const scalar_t *run_vec_weight, + const scalar_t *offset_waypoint, + const scalar_t *offset_tstep_fraction, + const int32_t *batch_pose_idx, const int mode, const int num_goals, + const int batch_size, const int horizon, const bool write_grad = false) + { + 
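
Reading off the metric branch of `compute_pose_distance` above: when `use_metric` is set, an error norm $d = \lVert e \rVert$ contributes $w \log_2(\cosh(\alpha d))$ to the cost, a smooth surrogate that behaves like a scaled $d^2$ near zero and like a scaled $d$ for large errors. Differentiating the natural-log form gives

$$
\frac{\partial}{\partial e}\, w \log\!\big(\cosh(\alpha \lVert e \rVert)\big)
  = \frac{w\,\alpha\,\sinh(\alpha \lVert e \rVert)}{\lVert e \rVert\,\cosh(\alpha \lVert e \rVert)}\; e,
$$

which is exactly the $(\alpha\,w\,\sinh(\alpha d)) / (d\,\cosh(\alpha d))$ scale applied to `best_distance_vec` in the `write_grad` path further down. The constant $1/\ln 2$ separating `log2f` from the natural log is not applied in the backward pass, so in effect it is folded into the weight.
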
const int t_idx     = (blockDim.x * blockIdx.x + threadIdx.x);
+      const int batch_idx = t_idx / horizon;
+      const int h_idx     = t_idx - (batch_idx * horizon);
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
-        best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
-        best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
-        best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
-  } else {
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
-    out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
-  }
-  }
-}
+      if ((batch_idx >= batch_size) || (h_idx >= horizon))
+      {
+        return;
+      }
-template
-__global__ void goalset_pose_distance_metric_kernel(
-    scalar_t *out_distance, scalar_t *out_position_distance,
-    scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
-    int32_t *out_gidx, const scalar_t *current_position,
-    const scalar_t *goal_position, const scalar_t *current_quat,
-    const scalar_t *goal_quat, const scalar_t *vec_weight,
-    const scalar_t *weight, const scalar_t *vec_convergence,
-    const scalar_t *run_weight, const scalar_t *run_vec_weight,
-    const int32_t *batch_pose_idx, const int mode, const int num_goals,
-    const int batch_size, const int horizon, const bool write_grad = false) {
-  const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
-  const int batch_idx = t_idx / horizon;
-  const int h_idx = t_idx - (batch_idx * horizon);
-  if (batch_idx >= batch_size || h_idx >= horizon) {
-    return;
-  }
+      // read current pose:
+      float3 position =
+        *(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
+      float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
+      float4 quat   = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
-  // read current pose:
-  float3 position =
-      *(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
-  float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
-  float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
+      // read weights:
+      float rotation_weight = weight[0];
+      float position_weight = weight[1];
+      float r_w_alpha       = weight[2];
+      float p_w_alpha       = weight[3];
+      bool  reach_offset    = false;
+      const float offset_tstep_ratio = offset_tstep_fraction[0];
+      int offset_tstep = floorf(offset_tstep_ratio * horizon); // if offset_tstep
+                                                               // is >= 
horizon, not + // in this mode + float d_vec_weight[6] = { 0.0 }; + *(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0]; + *(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3]; + float3 offset_rotation = *(float3 *)&offset_waypoint[0]; + float3 offset_position = *(float3 *)&offset_waypoint[3]; - // read weights: + if ((h_idx < horizon - 1) && (h_idx != horizon - offset_tstep)) + { + *(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0]; + *(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3]; + } - float position_weight = weight[1]; - float p_w_alpha = weight[3]; - float r_w_alpha = weight[2]; - float rotation_weight = weight[0]; - if (!write_distance) { - position_weight *= run_weight[h_idx]; - rotation_weight *= run_weight[h_idx]; - p_w_alpha *= run_weight[h_idx]; - r_w_alpha *= run_weight[h_idx]; - if (position_weight == 0.0 && rotation_weight == 0.0) { - return; - } - } + if (!write_distance) + { + position_weight *= run_weight[h_idx]; + rotation_weight *= run_weight[h_idx]; + float sum_weight = 0; + #pragma unroll 6 - float d_vec_convergence[2]; - *(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0]; + for (int i = 0; i < 6; i++) + { + sum_weight += d_vec_weight[i]; + } - float d_vec_weight[6]; - *(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0]; - *(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3]; - if (h_idx < horizon - 1) { - *(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0]; - *(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3]; - } + if (((position_weight == 0.0) && (rotation_weight == 0.0)) || (sum_weight == 0.0)) + { + return; + } + } - float des_vec_weight[6] = {0.0}; - float3 l_goal_position; - float4 l_goal_quat; - float distance_vec[6]; // = {0.0}; - float pose_distance = 0.0; - float position_distance = 0.0; - float rotation_distance = 0.0; - float best_distance = INFINITY; - float best_position_distance = 0.0; - float best_rotation_distance = 0.0; - float best_distance_vec[6] = {0.0}; - float best_des_vec_weight[6] = {0.0}; - int best_idx = -1.0; - int offset = batch_pose_idx[batch_idx]; - if (mode == BATCH_GOALSET || mode == BATCH_GOAL) { - offset = (offset)*num_goals; - } + if ((horizon > 1) && (offset_tstep >= 0 && (h_idx == horizon - offset_tstep))) + { + reach_offset = true; + } - for (int k = 0; k < num_goals; k++) { + float3 l_goal_position; + float4 l_goal_quat; + float distance_vec[6]; // = {0.0}; + float pose_distance = 0.0; + float position_distance = 0.0; + float rotation_distance = 0.0; + float best_distance = INFINITY; + float best_position_distance = 0.0; + float best_rotation_distance = 0.0; + float best_distance_vec[6] = { 0.0 }; + float d_vec_convergence[2]; - l_goal_position = *(float3 *)&goal_position[(offset + k) * 3]; - float4 gq4 = - *(float4 *)&goal_quat[(offset + k) * 4]; // reading qw, qx, qy, qz - l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x); + *(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0]; - transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]); - compute_metric_distance( - &distance_vec[0], pose_distance, position_distance, rotation_distance, - position, l_goal_position, quat, l_goal_quat, - &des_vec_weight[0], //&l_vec_weight[0], - &d_vec_convergence[0], //&l_vec_convergence[0], - position_weight, p_w_alpha, rotation_weight, r_w_alpha); - if (pose_distance <= best_distance) { - best_idx = k; - best_distance = pose_distance; - best_position_distance = position_distance; - best_rotation_distance = rotation_distance; - if (write_grad) { 
- // inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]); + int best_idx = -1; -#pragma unroll 6 - for (int i = 0; i < 6; i++) { - best_distance_vec[i] = distance_vec[i]; - best_des_vec_weight[i] = des_vec_weight[i]; + + // read offset + int offset = batch_pose_idx[batch_idx]; + + if ((mode == BATCH_GOALSET) || (mode == BATCH_GOAL)) + { + offset = (offset) * num_goals; + } + + for (int k = 0; k < num_goals; k++) + { + l_goal_position = *(float3 *)&goal_position[(offset + k) * 3]; + float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4]; + l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x); + + compute_pose_distance(&distance_vec[0], + pose_distance, + position_distance, + rotation_distance, + position, + l_goal_position, + quat, + l_goal_quat, + &d_vec_weight[0], + +// &l_vec_weight[0], + &d_vec_convergence[0], + +// &l_vec_convergence[0], + position_weight, + rotation_weight, + p_w_alpha, + r_w_alpha, + offset_position, + offset_rotation, + reach_offset); + + if (pose_distance <= best_distance) + { + best_idx = k; + best_distance = pose_distance; + best_position_distance = position_distance; + best_rotation_distance = rotation_distance; + + if (write_grad) + { + #pragma unroll 6 + + for (int i = 0; i < 6; i++) + { + best_distance_vec[i] = distance_vec[i]; + } + } + } + } + + // write out: + + // write out pose distance: + out_distance[batch_idx * horizon + h_idx] = best_distance; + + if (write_distance) + { + if (position_weight == 0.0) + { + best_position_distance = 0.0; + } + + if (rotation_weight == 0.0) + { + best_rotation_distance = 0.0; + } + out_position_distance[batch_idx * horizon + h_idx] = best_position_distance; + out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance; + } + out_gidx[batch_idx * horizon + h_idx] = best_idx; + + if (write_grad) + { + if (write_distance) + { + position_weight = 1; + rotation_weight = 1; + } + + if (best_position_distance > 0) + { + if (use_metric) + { + best_position_distance = + (p_w_alpha * position_weight * + sinhf(p_w_alpha * best_position_distance)) / + (best_position_distance * coshf(p_w_alpha * best_position_distance)); + } + else + { + best_position_distance = (position_weight / best_position_distance); + } + + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = + best_distance_vec[0] * best_position_distance; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = + best_distance_vec[1] * best_position_distance; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = + best_distance_vec[2] * best_position_distance; + } + else + { + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0; + } + + if (best_rotation_distance > 0) + { + if (use_metric) + { + best_rotation_distance = + (r_w_alpha * rotation_weight * + sinhf(r_w_alpha * best_rotation_distance)) / + (best_rotation_distance * coshf(r_w_alpha * best_rotation_distance)); + } + else + { + best_rotation_distance = rotation_weight / best_rotation_distance; + } + + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = + best_distance_vec[3] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = + best_distance_vec[4] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = + best_distance_vec[5] * best_rotation_distance; + } + else + { + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0; + out_q_vec[batch_idx * 
horizon * 4 + h_idx * 4 + 3] = 0.0; } } } - } - - // add scaling metric: - // best_distance = log2f(acoshf(best_distance)); - // write out: - - // write out pose distance: - out_distance[batch_idx * horizon + h_idx] = best_distance; - if (write_distance) { - if(position_weight == 0.0) - { - best_position_distance = 0.0; - } - if (rotation_weight == 0.0) - { - best_rotation_distance = 0.0; - } - out_position_distance[batch_idx * horizon + h_idx] = best_position_distance; - out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance; - } - out_gidx[batch_idx * horizon + h_idx] = best_idx; - - if (write_grad) { - // write gradient - // compute scalar term: - // -w * (d_vec)/ (length * length(negative +1) * acos(length) - // -w * (d_vec) * sinh(length) / (length * cosh(length)) - // compute extra term: - - if (write_distance) { - position_weight = 1.0; - rotation_weight = 1.0; - } - - if (best_position_distance > 0) { - best_position_distance = - (p_w_alpha * position_weight * - sinhf(p_w_alpha * best_position_distance)) / - (best_position_distance * coshf(p_w_alpha * best_position_distance)); - - // best_position_distance = (position_weight/best_position_distance); - // comput scalar gradient - - out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = - best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = - best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = - best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance; - - } else { - out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0; - out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0; - } - - if (best_rotation_distance > 0) { - best_rotation_distance = - (r_w_alpha * rotation_weight * - sinhf(r_w_alpha * best_rotation_distance)) / - (best_rotation_distance * coshf(r_w_alpha * best_rotation_distance)); - - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = - best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance; - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = - best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance; - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = - best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance; - } else { - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0; - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0; - out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0; - } - } -} - -} // namespace + } // namespace } std::vector pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance, torch::Tensor out_rotation_distance, - torch::Tensor distance_p_vector, // batch size, 3 - torch::Tensor distance_q_vector, // batch size, 4 + torch::Tensor distance_p_vector, // batch size, 3 + torch::Tensor distance_q_vector, // batch size, 4 torch::Tensor out_gidx, const torch::Tensor current_position, // batch_size, 3 const torch::Tensor goal_position, // n_boxes, 3 const torch::Tensor current_quat, const torch::Tensor goal_quat, - const torch::Tensor vec_weight, // n_boxes, 4, 4 + const torch::Tensor vec_weight, // n_boxes, 4, 4 const torch::Tensor weight, const torch::Tensor vec_convergence, const torch::Tensor run_weight, const torch::Tensor run_vec_weight, + const torch::Tensor offset_waypoint, + const torch::Tensor offset_tstep_fraction, const torch::Tensor batch_pose_idx, // batch_size, 1 const 
int batch_size, const int horizon, const int mode, const int num_goals = 1, const bool compute_grad = false, - const bool write_distance = true, const bool use_metric = false) { + const bool write_distance = true, const bool use_metric = false, + const bool project_distance = true) +{ using namespace Curobo::Pose; + // we compute the warp threads based on number of boxes: assert(batch_pose_idx.size(0) == batch_size); + // TODO: verify this math // const int batch_size = out_distance.size(0); assert(run_weight.size(-1) == horizon); - const int bh = batch_size * horizon; + const int bh = batch_size * horizon; int threadsPerBlock = bh; - if (bh > 128) { + + if (bh > 128) + { threadsPerBlock = 128; } @@ -641,115 +671,241 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance, int blocksPerGrid = (bh + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (use_metric) { - if (write_distance) - { - AT_DISPATCH_FLOATING_TYPES( - current_position.scalar_type(), "batch_pose_distance", ([&] { - goalset_pose_distance_metric_kernel - <<>>( - out_distance.data_ptr(), - out_position_distance.data_ptr(), - out_rotation_distance.data_ptr(), - distance_p_vector.data_ptr(), - distance_q_vector.data_ptr(), - out_gidx.data_ptr(), - current_position.data_ptr(), - goal_position.data_ptr(), - current_quat.data_ptr(), - goal_quat.data_ptr(), - vec_weight.data_ptr(), weight.data_ptr(), - vec_convergence.data_ptr(), - run_weight.data_ptr(), - run_vec_weight.data_ptr(), - batch_pose_idx.data_ptr(), mode, num_goals, - batch_size, horizon, compute_grad); - })); - - } - else - { - - AT_DISPATCH_FLOATING_TYPES( - current_position.scalar_type(), "batch_pose_distance", ([&] { - goalset_pose_distance_metric_kernel - // goalset_pose_distance_kernel - <<>>( - out_distance.data_ptr(), - out_position_distance.data_ptr(), - out_rotation_distance.data_ptr(), - distance_p_vector.data_ptr(), - distance_q_vector.data_ptr(), - out_gidx.data_ptr(), - current_position.data_ptr(), - goal_position.data_ptr(), - current_quat.data_ptr(), - goal_quat.data_ptr(), - vec_weight.data_ptr(), weight.data_ptr(), - vec_convergence.data_ptr(), - run_weight.data_ptr(), - run_vec_weight.data_ptr(), - batch_pose_idx.data_ptr(), mode, num_goals, - batch_size, horizon, compute_grad); - })); - } - } else { - if(write_distance) + if (project_distance) + { + if (use_metric) { - AT_DISPATCH_FLOATING_TYPES( - current_position.scalar_type(), "batch_pose_distance", ([&] { - // goalset_pose_distance_metric_kernel - goalset_pose_distance_kernel - <<>>( - out_distance.data_ptr(), - out_position_distance.data_ptr(), - out_rotation_distance.data_ptr(), - distance_p_vector.data_ptr(), - distance_q_vector.data_ptr(), - out_gidx.data_ptr(), - current_position.data_ptr(), - goal_position.data_ptr(), - current_quat.data_ptr(), - goal_quat.data_ptr(), - vec_weight.data_ptr(), weight.data_ptr(), - vec_convergence.data_ptr(), - run_weight.data_ptr(), - run_vec_weight.data_ptr(), - batch_pose_idx.data_ptr(), mode, num_goals, - batch_size, horizon, compute_grad); - })); + if (write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, + stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + 
goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, + stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } } else { - AT_DISPATCH_FLOATING_TYPES( - current_position.scalar_type(), "batch_pose_distance", ([&] { - // goalset_pose_distance_metric_kernel - goalset_pose_distance_kernel - <<>>( - out_distance.data_ptr(), - out_position_distance.data_ptr(), - out_rotation_distance.data_ptr(), - distance_p_vector.data_ptr(), - distance_q_vector.data_ptr(), - out_gidx.data_ptr(), - current_position.data_ptr(), - goal_position.data_ptr(), - current_quat.data_ptr(), - goal_quat.data_ptr(), - vec_weight.data_ptr(), weight.data_ptr(), - vec_convergence.data_ptr(), - run_weight.data_ptr(), - run_vec_weight.data_ptr(), - batch_pose_idx.data_ptr(), mode, num_goals, - batch_size, horizon, compute_grad); + if (write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + } + } + else + { + if (use_metric) + { + if (write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), 
"batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, + stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, + stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + } + else + { + if (write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + offset_waypoint.data_ptr(), + offset_tstep_fraction.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } } } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_distance, out_position_distance, out_rotation_distance, - distance_p_vector, distance_q_vector, out_gidx}; + return { out_distance, out_position_distance, out_rotation_distance, + distance_p_vector, distance_q_vector, out_gidx }; } std::vector @@ -758,10 +914,10 @@ backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q, const torch::Tensor grad_p_distance, // n_boxes, 3 const torch::Tensor grad_q_distance, const 
torch::Tensor pose_weight, - const torch::Tensor grad_p_vec, // n_boxes, 4, 4 + const torch::Tensor grad_p_vec, // n_boxes, 4, 4 const torch::Tensor grad_q_vec, const int batch_size, - const bool use_distance = false) { - + const bool use_distance = false) +{ // we compute the warp threads based on number of boxes: // TODO: verify this math @@ -769,7 +925,9 @@ backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q, using namespace Curobo::Pose; int threadsPerBlock = batch_size; - if (batch_size > 128) { + + if (batch_size > 128) + { threadsPerBlock = 128; } @@ -779,35 +937,37 @@ backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (use_distance) { - + if (use_distance) + { AT_DISPATCH_FLOATING_TYPES( - grad_distance.scalar_type(), "backward_pose_distance", ([&] { - backward_pose_distance_kernel - <<>>( - out_grad_p.data_ptr(), - out_grad_q.data_ptr(), - grad_distance.data_ptr(), - grad_p_distance.data_ptr(), - grad_q_distance.data_ptr(), - pose_weight.data_ptr(), - grad_p_vec.data_ptr(), - grad_q_vec.data_ptr(), batch_size); - })); - } else { + grad_distance.scalar_type(), "backward_pose_distance", ([&] { + backward_pose_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_grad_p.data_ptr(), + out_grad_q.data_ptr(), + grad_distance.data_ptr(), + grad_p_distance.data_ptr(), + grad_q_distance.data_ptr(), + pose_weight.data_ptr(), + grad_p_vec.data_ptr(), + grad_q_vec.data_ptr(), batch_size); + })); + } + else + { AT_DISPATCH_FLOATING_TYPES( - grad_distance.scalar_type(), "backward_pose", ([&] { - backward_pose_kernel - <<>>( - out_grad_p.data_ptr(), - out_grad_q.data_ptr(), - grad_distance.data_ptr(), - pose_weight.data_ptr(), - grad_p_vec.data_ptr(), - grad_q_vec.data_ptr(), batch_size); - })); + grad_distance.scalar_type(), "backward_pose", ([&] { + backward_pose_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_grad_p.data_ptr(), + out_grad_q.data_ptr(), + grad_distance.data_ptr(), + pose_weight.data_ptr(), + grad_p_vec.data_ptr(), + grad_q_vec.data_ptr(), batch_size); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_grad_p, out_grad_q}; -} \ No newline at end of file + return { out_grad_p, out_grad_q }; +} diff --git a/src/curobo/curobolib/cpp/self_collision_kernel.cu b/src/curobo/curobolib/cpp/self_collision_kernel.cu index 439c0ec..b9aaeb5 100644 --- a/src/curobo/curobolib/cpp/self_collision_kernel.cu +++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu @@ -24,420 +24,514 @@ #define GOALSET 2 #define BATCH_GOALSET 3 -namespace Curobo { - -namespace Geometry { - -template __inline__ __device__ scalar_t relu(scalar_t var) { - if (var < 0) - return 0; - else - return var; -} - -template -__global__ void self_collision_distance_kernel( - scalar_t *out_distance, // batch x 1 - scalar_t *out_vec, // batch x nspheres x 4 - const scalar_t *robot_spheres, // batch x nspheres x 4 - const scalar_t *collision_threshold, const int batch_size, - const int nspheres, const scalar_t *weight, const bool write_grad = false) { - const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (batch_idx >= batch_size) { - return; - } - float r_diff, distance; - float max_penetration = 0; - float3 sph1, sph2, dist_vec; - int sph1_idx = -1; - int sph2_idx = -1; - // iterate over spheres: - for (int i = 0; i < nspheres; i++) { - sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4]; - for (int j = i + 1; j < nspheres; j++) { - r_diff = 
collision_threshold[i * nspheres + j]; - if (isinf(r_diff)) { - continue; - } - sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4]; - // compute sphere distance: - distance = relu(r_diff - length(sph1 - sph2)); - if (distance > max_penetration) { - max_penetration = distance; - sph1_idx = i; - sph2_idx = j; - if (write_grad) { - dist_vec = (sph1 - sph2) / distance; - } - } +namespace Curobo +{ + namespace Geometry + { + template__inline__ __device__ scalar_t relu(scalar_t var) + { + if (var < 0) + return 0; + else + return var; } - } - // write out pose distance: - if (max_penetration > 0) { - out_distance[batch_idx] = weight[0] * max_penetration; - if (write_grad) { - *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] = - weight[0] * -1 * dist_vec; - *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] = - weight[0] * dist_vec; - } - } -} + template + __global__ void self_collision_distance_kernel( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 4 + const scalar_t *robot_spheres, // batch x nspheres x 4 + const scalar_t *collision_threshold, const int batch_size, + const int nspheres, const scalar_t *weight, const bool write_grad = false) + { + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; -typedef struct { - float d; - int16_t i; - int16_t j; -} dist_t; + if (batch_idx >= batch_size) + { + return; + } + float r_diff, distance; + float max_penetration = 0; + float3 sph1, sph2, dist_vec; + int sph1_idx = -1; + int sph2_idx = -1; + // iterate over spheres: + for (int i = 0; i < nspheres; i++) + { + sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4]; + for (int j = i + 1; j < nspheres; j++) + { + r_diff = collision_threshold[i * nspheres + j]; -/////////////////////////////////////////////////////////// -// n warps per row -// ndpt rows per warp -/////////////////////////////////////////////////////////// -template -__global__ void self_collision_distance_kernel4( - scalar_t *out_distance, // batch x 1 - scalar_t *out_vec, // batch x nspheres x 3 - const scalar_t *robot_spheres, // batch x nspheres x 3 - const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size, - const int nspheres, - const int ndpt, // number of distances to be computed per thread - const int nwpr, // number of warps per row - const scalar_t *weight, uint8_t *sparse_index, - const bool write_grad = false) { - int batch_idx = blockIdx.x; - int warp_idx = threadIdx.x / 32; - int i = ndpt * (warp_idx / nwpr); // starting row number for this warp - int j = (warp_idx % nwpr) * 32; // starting column number for this warp - - dist_t max_d = {.d = 0.0, .i = 0, .j = 0}; - __shared__ dist_t max_darr[32]; - - // Optimization: About 1/3 of the warps will have no work. 
- // We compute distances only when i j + 31) { // this warp has no work - max_darr[warp_idx] = max_d; - return; - } - - // load robot_spheres to shared memory - extern __shared__ float4 __rs_shared[]; - if (threadIdx.x < nspheres) { - float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)]; - //float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)], - //robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1], - //robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2], - //robot_spheres_radius[threadIdx.x]) ; - sph.w += offsets[threadIdx.x]; - __rs_shared[threadIdx.x] = sph; - } - __syncthreads(); - - ////////////////////////////////////////////////////// - // Compute distances and store the maximum per thread - // in registers (max_d). - // Each thread computes upto ndpt distances. - // two warps per row - ////////////////////////////////////////////////////// - // int nspheres_2 = nspheres * nspheres; - - j = j + threadIdx.x % 32; // column number for this thread - - float4 sph2; - if (j < nspheres) { - sph2 = __rs_shared[j]; // we need not load sph2 in every iteration. - -#pragma unroll 16 - for (int k = 0; k < ndpt; k++, i++) { // increment i also here - if (i < nspheres && j > i) { - // check if self collision is allowed here: - if (coll_matrix[i * nspheres + j] == 1) { - - float4 sph1 = __rs_shared[i]; - if (sph1.w <= 0.0 || sph2.w <= 0.0) + if (isinf(r_diff)) { continue; } - float r_diff = sph1.w + sph2.w; + sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4]; + + // compute sphere distance: + distance = relu(r_diff - length(sph1 - sph2)); + + if (distance > max_penetration) + { + max_penetration = distance; + sph1_idx = i; + sph2_idx = j; + + if (write_grad) + { + dist_vec = (sph1 - sph2) / distance; + } + } + } + } + + // write out pose distance: + if (max_penetration > 0) + { + out_distance[batch_idx] = weight[0] * max_penetration; + + if (write_grad) + { + *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] = + weight[0] * dist_vec; + } + } + } + + typedef struct { + float d; + int16_t i; + int16_t j; + } dist_t; + + + /////////////////////////////////////////////////////////// + // n warps per row + // ndpt rows per warp + /////////////////////////////////////////////////////////// + template + __global__ void self_collision_distance_kernel4( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 3 + const scalar_t *robot_spheres, // batch x nspheres x 3 + const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size, + const int nspheres, + const int ndpt, // number of distances to be computed per thread + const int nwpr, // number of warps per row + const scalar_t *weight, uint8_t *sparse_index, + const bool write_grad = false) + { + int batch_idx = blockIdx.x; + int warp_idx = threadIdx.x / 32; + int i = ndpt * (warp_idx / nwpr); // starting row number for this warp + int j = (warp_idx % nwpr) * 32; // starting column number for this warp + + dist_t max_d = { .d = 0.0, .i = 0, .j = 0 }; + __shared__ dist_t max_darr[32]; + + // Optimization: About 1/3 of the warps will have no work. 
+ // We compute distances only when i j + 31) // this warp has no work + { + max_darr[warp_idx] = max_d; + return; + } + + // load robot_spheres to shared memory + extern __shared__ float4 __rs_shared[]; + + if (threadIdx.x < nspheres) + { + float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)]; + + // float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)], + // robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1], + // robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2], + // robot_spheres_radius[threadIdx.x]) ; + sph.w += offsets[threadIdx.x]; + __rs_shared[threadIdx.x] = sph; + } + __syncthreads(); + + ////////////////////////////////////////////////////// + // Compute distances and store the maximum per thread + // in registers (max_d). + // Each thread computes upto ndpt distances. + // two warps per row + ////////////////////////////////////////////////////// + // int nspheres_2 = nspheres * nspheres; + + j = j + threadIdx.x % 32; // column number for this thread + + float4 sph2; + + if (j < nspheres) + { + sph2 = __rs_shared[j]; // we need not load sph2 in every iteration. + +#pragma unroll 16 + + for (int k = 0; k < ndpt; k++, i++) // increment i also here + { + if ((i < nspheres) && (j > i)) + { + // check if self collision is allowed here: + if (coll_matrix[i * nspheres + j] == 1) + { + float4 sph1 = __rs_shared[i]; + + if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) + { + continue; + } + float r_diff = sph1.w + sph2.w; + float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + + (sph1.y - sph2.y) * (sph1.y - sph2.y) + + (sph1.z - sph2.z) * (sph1.z - sph2.z)); + + // float distance = max((float)0.0, (float)(r_diff - d)); + float distance = r_diff - d; + + // printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j, + // distance); + if (distance > max_d.d) + { + max_d.d = distance; + max_d.i = i; + max_d.j = j; + } + } + } + } + } + + // max_d has the result max for this thread + + ////////////////////////////////////////////////////// + // Reduce gridDim.x values using gridDim.x threads + ////////////////////////////////////////////////////// + + // Perform warp-wide reductions + // Optimization: Skip the reduction if all the values are zero + unsigned zero_mask = __ballot_sync( + 0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So, + + // zero_mask should be 0 in the common case. 
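
The reduction that follows is a standard warp-tree maximum, made possible by `dist_t` packing a `float` and two 16-bit indices into exactly 8 bytes, so a candidate can travel through `__shfl_down_sync` as a single 64-bit word. A minimal sketch of the pattern, assuming all 32 lanes of the warp are active (the names here are illustrative, not from this patch):

```cuda
__device__ __forceinline__ dist_t warp_max_dist(dist_t v)
{
  // Skip the shuffle tree when every lane holds 0, the common case here.
  if (__ballot_sync(0xffffffff, v.d != 0.0f) == 0)
    return v;

  for (int offset = 16; offset > 0; offset /= 2)
  {
    // Pun the 8-byte struct through a 64-bit shuffle.
    uint64_t nd     = __shfl_down_sync(0xffffffff, *(uint64_t *)&v, offset);
    dist_t   d_temp = *(dist_t *)&nd;

    if (d_temp.d > v.d)
      v = d_temp; // keep the deepest penetration seen so far
  }
  return v; // lane 0 now holds the warp-wide maximum
}
```
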
+ if (zero_mask != 0) // some of the values are non-zero + { + unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); + + if (threadIdx.x < blockDim.x) + { + // dist_t max_d = dist_sh[threadIdx.x]; + for (int offset = 16; offset > 0; offset /= 2) + { + uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset); + dist_t d_temp = *(dist_t *)&nd; + + if (d_temp.d > max_d.d) + { + max_d = d_temp; + } + } + } + } + + // thread0 in the warp has the max_d for the warp + if (threadIdx.x % 32 == 0) + { + max_darr[warp_idx] = max_d; + + // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, + // blockIdx.x, max_d); + } + + if (threadIdx.x < nspheres) + { + if (write_grad && (sparse_index[batch_idx * nspheres + threadIdx.x] != 0)) + { + *(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] = + make_float4(0.0); + sparse_index[batch_idx * nspheres + threadIdx.x] = 0; + } + } + __syncthreads(); + + if (threadIdx.x == 0) + { + dist_t max_d = max_darr[0]; + + // TODO: This can be parallized + for (int i = 1; i < (blockDim.x + 31) / 32; i++) + { + if (max_darr[i].d > max_d.d) + { + max_d = max_darr[i]; + } + } + + ////////////////////////////////////////////////////// + // Write out the final results + ////////////////////////////////////////////////////// + if (max_d.d != 0.0) + { + out_distance[batch_idx] = weight[0] * max_d.d; + + if (write_grad) + { + float3 sph1 = + *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)]; + float3 sph2 = + *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)]; + float3 dist_vec = (sph1 - sph2) / max_d.d; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] = + weight[0] * dist_vec; + sparse_index[batch_idx * nspheres + max_d.i] = 1; + sparse_index[batch_idx * nspheres + max_d.j] = 1; + } + } + else + { + out_distance[batch_idx] = 0; + } + } + } + + template + __global__ void self_collision_distance_kernel7( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 3 + uint8_t *sparse_index, + const scalar_t *robot_spheres, // batch x nspheres x 3 + const scalar_t *offsets, // nspheres + const scalar_t *weight, const int16_t *locations_, const int batch_size, + const int nspheres, const bool write_grad = false) + { + uint32_t batch_idx = blockIdx.x * NBPB; + uint8_t nbpb = min(NBPB, batch_size - batch_idx); + + if (nbpb == 0) + return; + + // Layout in shared memory: + // sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=4] + // sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=4] + // ... + extern __shared__ float4 __rs_shared[]; + + if (threadIdx.x < nspheres) // threadIdx.x is sphere index + { +#pragma unroll + + for (int l = 0; l < nbpb; l++) + { + float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)]; + + // float4 sph = make_float4( + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)], + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1], + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2], + // robot_spheres_radius[threadIdx.x] + // ); + + sph.w += offsets[threadIdx.x]; + __rs_shared[NBPB * threadIdx.x + l] = sph; + } + } + __syncthreads(); + + ////////////////////////////////////////////////////// + // Compute distances and store the maximum per thread + // in registers (max_d). + // Each thread computes upto ndpt distances. 
+ ////////////////////////////////////////////////////// + dist_t max_d[NBPB] = {{ .d = 0.0, .i = 0, .j = 0 } }; + int16_t indices[ndpt * 2]; + + for (uint8_t i = 0; i < ndpt * 2; i++) + { + indices[i] = locations_[(threadIdx.x) * 2 * ndpt + i]; + } + +#pragma unroll + + for (uint8_t k = 0; k < ndpt; k++) + { + // We are iterating through ndpt pair of spheres across batch + // if we increase ndpt, then we can compute for more spheres? + int i = indices[k * 2]; + int j = indices[k * 2 + 1]; + + if ((i == -1) || (j == -1)) + continue; + +#pragma unroll + + for (uint16_t l = 0; l < nbpb; l++) // iterate through nbpb batches + { + float4 sph1 = __rs_shared[NBPB * i + l]; + float4 sph2 = __rs_shared[NBPB * j + l]; + + if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) + { + continue; + } + float r_diff = + sph1.w + sph2.w; // sum of two radii, radii include respective offsets float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + (sph1.y - sph2.y) * (sph1.y - sph2.y) + (sph1.z - sph2.z) * (sph1.z - sph2.z)); - //float distance = max((float)0.0, (float)(r_diff - d)); - float distance = r_diff - d; - // printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j, - // distance); - if (distance > max_d.d) { - max_d.d = distance; - max_d.i = i; - max_d.j = j; + float f_diff = r_diff - d; + + if (f_diff > max_d[l].d) + { + max_d[l].d = f_diff; + max_d[l].i = i; + max_d[l].j = j; } } } - } - } - // max_d has the result max for this thread - ////////////////////////////////////////////////////// - // Reduce gridDim.x values using gridDim.x threads - ////////////////////////////////////////////////////// + // max_d has the result max for this thread - // Perform warp-wide reductions - // Optimization: Skip the reduction if all the values are zero - unsigned zero_mask = __ballot_sync( - 0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So, - // zero_mask should be 0 in the common case. - if (zero_mask != 0) { // some of the values are non-zero - unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); - if (threadIdx.x < blockDim.x) { - // dist_t max_d = dist_sh[threadIdx.x]; - for (int offset = 16; offset > 0; offset /= 2) { - uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset); - dist_t d_temp = *(dist_t *)&nd; - if (d_temp.d > max_d.d) { - max_d = d_temp; - } - } - } - } + ////////////////////////////////////////////////////// + // Reduce gridDim.x values using gridDim.x threads + ////////////////////////////////////////////////////// + // We find the sum across 32 threads. Hence, we are limited to running all our self collision + // distances for a batch_idx to 32 threads. 
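
Once each warp has a winner, the block-level combine performed by the added code further down is a short serial scan over at most 32 entries per sub-batch, laid out as `max_darr[warp_id + 32 * l]` for sub-batch `l`. An illustrative sketch of that stage (names are not from the patch):

```cuda
__device__ __forceinline__ dist_t block_max_dist(const dist_t *max_darr,
                                                 const int num_warps,
                                                 const int l)
{
  dist_t best = max_darr[32 * l]; // warp 0's winner for sub-batch l

  // num_warps == (blockDim.x + 31) / 32 <= 32, so the serial scan is short.
  for (int w = 1; w < num_warps; w++)
  {
    if (max_darr[32 * l + w].d > best.d)
      best = max_darr[32 * l + w];
  }
  return best;
}
```
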
- // thread0 in the warp has the max_d for the warp - if (threadIdx.x % 32 == 0) { - max_darr[warp_idx] = max_d; - // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, - // blockIdx.x, max_d); - } - - if (threadIdx.x < nspheres) { - if (write_grad && sparse_index[batch_idx * nspheres + threadIdx.x] != 0) { - - *(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] = - make_float4(0.0); - sparse_index[batch_idx * nspheres + threadIdx.x] = 0; - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - dist_t max_d = max_darr[0]; - // TODO: This can be parallized - for (int i = 1; i < (blockDim.x + 31) / 32; i++) { - if (max_darr[i].d > max_d.d) { - max_d = max_darr[i]; - } - } - ////////////////////////////////////////////////////// - // Write out the final results - ////////////////////////////////////////////////////// - if (max_d.d != 0.0) { - out_distance[batch_idx] = weight[0] * max_d.d; - if (write_grad) { - float3 sph1 = - *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)]; - float3 sph2 = - *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)]; - float3 dist_vec = (sph1 - sph2) / max_d.d; - *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] = - weight[0] * -1 * dist_vec; - *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] = - weight[0] * dist_vec; - sparse_index[batch_idx * nspheres + max_d.i] = 1; - sparse_index[batch_idx * nspheres + max_d.j] = 1; - } - } else { - out_distance[batch_idx] = 0; - } - } -} -template -__global__ void self_collision_distance_kernel7( - scalar_t *out_distance, // batch x 1 - scalar_t *out_vec, // batch x nspheres x 3 - uint8_t *sparse_index, - const scalar_t *robot_spheres, // batch x nspheres x 3 - const scalar_t *offsets, // nspheres - const scalar_t *weight, const int16_t *locations_, const int batch_size, - const int nspheres, const bool write_grad = false) { - uint32_t batch_idx = blockIdx.x * NBPB; - uint8_t nbpb = min(NBPB, batch_size - batch_idx); - - if (nbpb == 0) - return; - - // Layout in shared memory: - // sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=4] - // sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=4] - // ... - extern __shared__ float4 __rs_shared[]; - if (threadIdx.x < nspheres) { // threadIdx.x is sphere index -#pragma unroll - for (int l = 0; l < nbpb; l++) { - float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)]; - - //float4 sph = make_float4( - // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)], - // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1], - // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2], - // robot_spheres_radius[threadIdx.x] - //); - - sph.w += offsets[threadIdx.x]; - __rs_shared[NBPB * threadIdx.x + l] = sph; - } - } - __syncthreads(); - - ////////////////////////////////////////////////////// - // Compute distances and store the maximum per thread - // in registers (max_d). - // Each thread computes upto ndpt distances. - ////////////////////////////////////////////////////// - dist_t max_d[NBPB] = {{.d = 0.0, .i = 0, .j = 0}}; - int16_t indices[ndpt * 2]; - for (uint8_t i =0; i< ndpt * 2; i++) - { - indices[i] = locations_[(threadIdx.x) * 2 * ndpt + i]; - - } + __shared__ dist_t max_darr[32 * NBPB]; #pragma unroll - for (uint8_t k = 0; k < ndpt; k++) { - // We are iterating through ndpt pair of spheres across batch - // if we increase ndpt, then we can compute for more spheres? 
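
The `locations_` table consumed here is what the comment later in this hunk calls the start addresses precomputed per thread on the CPU: each thread owns `ndpt` sphere pairs, stored as flat `(i, j)` int16 entries with `-1` padding. A hypothetical host-side builder, shown only to illustrate the layout the kernel expects (the real table is built elsewhere in the package):

```cpp
#include <cstdint>
#include <vector>

std::vector<int16_t> build_thread_locations(const std::vector<uint8_t>& coll_matrix,
                                            const int nspheres, const int ndpt,
                                            const int nthreads)
{
  std::vector<int16_t> loc(2 * ndpt * nthreads, -1); // -1 marks an empty slot
  int slot = 0;

  for (int i = 0; i < nspheres; i++)
  {
    for (int j = i + 1; j < nspheres; j++)
    {
      // Keep only pairs enabled in the self-collision matrix.
      if ((coll_matrix[i * nspheres + j] == 1) && (slot < ndpt * nthreads))
      {
        loc[2 * slot]     = (int16_t)i; // thread (slot / ndpt) checks this pair
        loc[2 * slot + 1] = (int16_t)j;
        slot++;
      }
    }
  }
  // Thread t reads loc[t * 2 * ndpt] .. loc[t * 2 * ndpt + 2 * ndpt - 1].
  return loc;
}
```
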
- int i = indices[k * 2]; - int j = indices[k * 2 + 1]; - if (i == -1 || j == -1) - continue; -#pragma unroll - for (uint16_t l = 0; l < nbpb; l++) { // iterate through nbpb batches - float4 sph1 = __rs_shared[NBPB * i + l]; - float4 sph2 = __rs_shared[NBPB * j + l]; - if (sph1.w <= 0.0 || sph2.w <= 0.0) + for (uint8_t l = 0; l < nbpb; l++) { - continue; + // Perform warp-wide reductions + // Optimization: Skip the reduction if all the values are zero + unsigned zero_mask = __ballot_sync( + 0xffffffff, + max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask + + // should be 0 in the common case. + if (zero_mask != 0) // some of the values are non-zero + { + unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); + + if (threadIdx.x < blockDim.x) + { + // dist_t max_d = dist_sh[threadIdx.x]; + for (int offset = 16; offset > 0; offset /= 2) + { + // the offset here is linked to ndpt? + uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset); + dist_t d_temp = *(dist_t *)&nd; + + if (d_temp.d > max_d[l].d) + { + max_d[l] = d_temp; + } + } + } + } + + // thread0 in the warp has the max_d for the warp + if (threadIdx.x % 32 == 0) + { + max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l]; + + // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, + // blockIdx.x, max_d); + } } - float r_diff = - sph1.w + sph2.w; // sum of two radii, radii include respective offsets - float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + - (sph1.y - sph2.y) * (sph1.y - sph2.y) + - (sph1.z - sph2.z) * (sph1.z - sph2.z)); - float f_diff = r_diff - d; - if (f_diff > max_d[l].d) { - max_d[l].d = f_diff; - max_d[l].i = i; - max_d[l].j = j; + + if (threadIdx.x < nspheres) + { + for (int l = 0; l < nbpb; l++) + { + if (write_grad && + (sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0)) + { + *(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] = + make_float4(0.0); + sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0; + } + } } - } - } - // max_d has the result max for this thread - - ////////////////////////////////////////////////////// - // Reduce gridDim.x values using gridDim.x threads - ////////////////////////////////////////////////////// - // We find the sum across 32 threads. Hence, we are limited to running all our self collision - // distances for a batch_idx to 32 threads. - - __shared__ dist_t max_darr[32 * NBPB]; + __syncthreads(); + if (threadIdx.x == 0) + { #pragma unroll - for (uint8_t l = 0; l < nbpb; l++) { - // Perform warp-wide reductions - // Optimization: Skip the reduction if all the values are zero - unsigned zero_mask = __ballot_sync( - 0xffffffff, - max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask - // should be 0 in the common case. - if (zero_mask != 0) { // some of the values are non-zero - unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); - if (threadIdx.x < blockDim.x) { - // dist_t max_d = dist_sh[threadIdx.x]; - for (int offset = 16; offset > 0; offset /= 2) { - // the offset here is linked to ndpt? 
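// Note: the shuffle offsets 16, 8, 4, 2, 1 follow from the 32-lane warp
// width, not from ndpt; each step halves the number of live lanes until
// lane 0 holds the maximum, regardless of how many sphere pairs each thread
// scanned before the reduction.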
- uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset); - dist_t d_temp = *(dist_t *)&nd; - if (d_temp.d > max_d[l].d) { - max_d[l] = d_temp; + + for (uint8_t l = 0; l < nbpb; l++) + { + dist_t max_d = max_darr[l * 32]; + + // TODO: This can be parallized + for (int i = 1; i < (blockDim.x + 31) / 32; i++) + { + if (max_darr[l * 32 + i].d > max_d.d) + { + max_d = max_darr[l * 32 + i]; + } + } + + ////////////////////////////////////////////////////// + // Write out the final results + ////////////////////////////////////////////////////// + if (max_d.d != 0.0) + { + out_distance[batch_idx + l] = weight[0] * max_d.d; + + if (write_grad) + { + float3 sph1 = + *(float3 *)&robot_spheres[4 * + ((batch_idx + l) * nspheres + max_d.i)]; + float3 sph2 = + *(float3 *)&robot_spheres[4 * + ((batch_idx + l) * nspheres + max_d.j)]; + float3 dist_vec = (sph1 - sph2) / max_d.d; + *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] = + weight[0] * dist_vec; + sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1; + sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1; + } + } + else + { + out_distance[batch_idx + l] = 0; } } } } - // thread0 in the warp has the max_d for the warp - if (threadIdx.x % 32 == 0) { - max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l]; - // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, - // blockIdx.x, max_d); - } - } - - if (threadIdx.x < nspheres) { - for (int l = 0; l < nbpb; l++) { - - if (write_grad && - sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0) { - *(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] = - make_float4(0.0); - sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0; - } - } - } - __syncthreads(); - - if (threadIdx.x == 0) { -#pragma unroll - for (uint8_t l = 0; l < nbpb; l++) { - dist_t max_d = max_darr[l * 32]; - // TODO: This can be parallized - for (int i = 1; i < (blockDim.x + 31) / 32; i++) { - if (max_darr[l * 32 + i].d > max_d.d) { - max_d = max_darr[l * 32 + i]; - } - } - ////////////////////////////////////////////////////// - // Write out the final results - ////////////////////////////////////////////////////// - if (max_d.d != 0.0) { - out_distance[batch_idx + l] = weight[0] * max_d.d; - if (write_grad) { - float3 sph1 = - *(float3 *)&robot_spheres[4 * - ((batch_idx + l) * nspheres + max_d.i)]; - float3 sph2 = - *(float3 *)&robot_spheres[4 * - ((batch_idx + l) * nspheres + max_d.j)]; - float3 dist_vec = (sph1 - sph2) / max_d.d; - *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] = - weight[0] * -1 * dist_vec; - *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] = - weight[0] * dist_vec; - sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1; - sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1; - } - } else { - out_distance[batch_idx + l] = 0; - } - } - } -} -} // namespace Geometry -} // namespace Curobo + } // namespace Geometry +} // namespace Curobo // This is the best version so far. // It precomputes the start addresses per thread on the cpu. // The rest is similar to the version above. 
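
Because `ndpt` sizes a per-thread index array inside the kernel, it is a template parameter, and the wrapper below has to map the runtime value onto a fixed set of instantiations; that is all the long `else if (ndpt == ...)` ladder does. A compact way to picture it, with illustrative names that are not part of this patch:

```cpp
#include <cstdio>

// Stand-in for launching self_collision_distance_kernel7 at a given ndpt.
template <int NDPT>
void launch_for_ndpt()
{
  std::printf("launch kernel instantiated with ndpt=%d\n", NDPT);
}

void dispatch_ndpt(const int ndpt)
{
  switch (ndpt)
  {
    case 1:   launch_for_ndpt<1>();   break;
    case 2:   launch_for_ndpt<2>();   break;
    case 4:   launch_for_ndpt<4>();   break;
    case 8:   launch_for_ndpt<8>();   break;
    case 32:  launch_for_ndpt<32>();  break;
    case 64:  launch_for_ndpt<64>();  break;
    case 128: launch_for_ndpt<128>(); break;
    default:  break; // unsupported values launch nothing
  }
}
```
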
-std::vector<torch::Tensor> self_collision_distance(
-    torch::Tensor out_distance, torch::Tensor out_vec,
-    torch::Tensor sparse_index,
-    const torch::Tensor robot_spheres,    // batch_size x n_spheres x 3
-    const torch::Tensor collision_offset, // n_spheres x n_spheres
-    const torch::Tensor weight, const torch::Tensor collision_matrix,
-    const torch::Tensor thread_locations, const int thread_locations_size,
-    const int batch_size, const int nspheres, const bool compute_grad = false,
-    const int ndpt = 8, // Does this need to match template?
-    const bool experimental_kernel = false) {
+std::vector<torch::Tensor>self_collision_distance(
+  torch::Tensor out_distance, torch::Tensor out_vec,
+  torch::Tensor sparse_index,
+  const torch::Tensor robot_spheres,    // batch_size x n_spheres x 3
+  const torch::Tensor collision_offset, // n_spheres x n_spheres
+  const torch::Tensor weight, const torch::Tensor collision_matrix,
+  const torch::Tensor thread_locations, const int thread_locations_size,
+  const int batch_size, const int nspheres, const bool compute_grad = false,
+  const int ndpt = 8, // Does this need to match template?
+  const bool experimental_kernel = false)
+{
   using namespace Curobo::Geometry;

   // use efficient kernel based on number of threads:
@@ -445,165 +539,159 @@ std::vector<torch::Tensor> self_collision_distance(
   assert(nspheres < 1024);
-
+
   int threadsPerBlock = ((thread_locations_size / 2) + ndpt - 1) / ndpt; // location_size must be an even number. We store
-                                  // i,j for each sphere pair.
-  //assert(threadsPerBlock/nbpb <=32);
-  if (threadsPerBlock < 32*nbpb)
+
+  // i,j for each sphere pair.
+  // assert(threadsPerBlock/nbpb <=32);
+  if (threadsPerBlock < 32 * nbpb)
  {
    threadsPerBlock = 32 * nbpb;
  }
+
  if (threadsPerBlock < nspheres)
  {
    threadsPerBlock = nspheres;
  }
-  int blocksPerGrid = (batch_size + nbpb - 1) / nbpb; // number of batches per block
+  int blocksPerGrid = (batch_size + nbpb - 1) / nbpb;  // number of batches per block

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  // ((threadsPerBlock >= nspheres))&&
-  if (experimental_kernel) {
-
+  // ((threadsPerBlock >= nspheres))&&
+  if (experimental_kernel)
+  {
    int smemSize = nbpb * nspheres * sizeof(float4);

-    if (ndpt == 1) {
-
+    if (ndpt == 1)
+    {
      AT_DISPATCH_FLOATING_TYPES(
          robot_spheres.scalar_type(), "self_collision_distance", ([&] {
-            self_collision_distance_kernel7<scalar_t, 1>
-                <<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
-                    out_distance.data_ptr<scalar_t>(),
-                    out_vec.data_ptr<scalar_t>(),
-                    sparse_index.data_ptr<uint8_t>(),
-                    robot_spheres.data_ptr<scalar_t>(),
-                    collision_offset.data_ptr<scalar_t>(),
-                    weight.data_ptr<scalar_t>(),
-                    thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
-                    compute_grad);
-          }));
+          self_collision_distance_kernel7<scalar_t, 1>
+          << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > (
+            out_distance.data_ptr<scalar_t>(),
+            out_vec.data_ptr<scalar_t>(),
+            sparse_index.data_ptr<uint8_t>(),
+            robot_spheres.data_ptr<scalar_t>(),
+            collision_offset.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+            thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
+            compute_grad);
+        }));
    }
-
-    else if (ndpt == 2) {
+    else if (ndpt == 2)
+    {
      AT_DISPATCH_FLOATING_TYPES(
          robot_spheres.scalar_type(), "self_collision_distance", ([&] {
-            self_collision_distance_kernel7<scalar_t, 2>
-                <<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
-                    out_distance.data_ptr<scalar_t>(),
-                    out_vec.data_ptr<scalar_t>(),
-                    sparse_index.data_ptr<uint8_t>(),
-                    robot_spheres.data_ptr<scalar_t>(),
-                    collision_offset.data_ptr<scalar_t>(),
-                    weight.data_ptr<scalar_t>(),
-                    thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
-                    compute_grad);
-          }));
+          self_collision_distance_kernel7<scalar_t, 2>
+          << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > (
+            out_distance.data_ptr<scalar_t>(),
+            out_vec.data_ptr<scalar_t>(),
+            sparse_index.data_ptr<uint8_t>(),
+            robot_spheres.data_ptr<scalar_t>(),
+            collision_offset.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+
thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } - + else if (ndpt == 4) { - AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else if (ndpt == 8) { - AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else if (ndpt == 32) { - - - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else if (ndpt == 64) { AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else if (ndpt == 128) { AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); - + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + 
sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else if (ndpt == 512) { AT_DISPATCH_FLOATING_TYPES( robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel7 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - sparse_index.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - weight.data_ptr(), - thread_locations.data_ptr(), batch_size, nspheres, - compute_grad); - })); + self_collision_distance_kernel7 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); } else { @@ -611,12 +699,14 @@ std::vector self_collision_distance( } } - else { - int ndpt_n = 32; // number of distances to be computed per thread - int nwpr = (nspheres + 31) / 32; + else + { + int ndpt_n = 32; // number of distances to be computed per thread + int nwpr = (nspheres + 31) / 32; int warpsPerBlock = nwpr * ((nspheres + ndpt_n - 1) / ndpt_n); threadsPerBlock = warpsPerBlock * 32; - blocksPerGrid = batch_size; + blocksPerGrid = batch_size; + // printf("Blocks: %d, threads/block:%d, ndpt=%d, nwpr=%d\n", blocksPerGrid, // threadsPerBlock, ndpt, nwpr); @@ -624,20 +714,19 @@ std::vector self_collision_distance( AT_DISPATCH_FLOATING_TYPES( - robot_spheres.scalar_type(), "self_collision_distance", ([&] { - self_collision_distance_kernel4 - <<>>( - out_distance.data_ptr(), - out_vec.data_ptr(), - robot_spheres.data_ptr(), - collision_offset.data_ptr(), - collision_matrix.data_ptr(), batch_size, nspheres, - ndpt_n, nwpr, weight.data_ptr(), - sparse_index.data_ptr(), compute_grad); - })); + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel4 + << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > ( + out_distance.data_ptr(), + out_vec.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + collision_matrix.data_ptr(), batch_size, nspheres, + ndpt_n, nwpr, weight.data_ptr(), + sparse_index.data_ptr(), compute_grad); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_distance, out_vec, sparse_index}; + return { out_distance, out_vec, sparse_index }; } - diff --git a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu index 4d172e5..ff1022f 100644 --- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu +++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu @@ -18,1550 +18,1702 @@ #include #include #define M 4 + // #define MAX_WARP_THREADS 512 // warp level batching. 
8 x M = 32 #define MAX_BOX_SHARED 256 // maximum number of boxes we can store distance and closest point #define DEBUG false -namespace Curobo { -namespace Geometry { - -/** - * @brief Compute length of sphere - * - * @param v1 - * @param v2 - * @return float - */ -__device__ __forceinline__ float sphere_length(const float4 &v1, - const float4 &v2) { - return norm3df(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z); -} - -__device__ __forceinline__ float sphere_distance(const float4 &v1, - const float4 &v2) { - return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w); -} - -template -__device__ __forceinline__ void load_obb_pose(const scalar_t *obb_mat, -float3 &position, float4 &quat) -{ // obb_mat has x,y,z, qw, qx, qy, qz, 0 with an extra 0 padding for better use of memory - float4 temp = *(float4 *)&obb_mat[0]; - position.x = temp.x; - position.y = temp.y; - position.z = temp.z; - quat.w = temp.w; - temp = *(float4 *)&obb_mat[4]; - quat.x = temp.x; - quat.y = temp.y; - quat.z = temp.z; -} -template -__device__ __forceinline__ void load_obb_bounds(const scalar_t *obb_bounds, -float3 &bounds) -{ // obb_bounds has x,y,z, 0 with an extra 0 padding. - float4 loc_bounds = *(float4 *)&obb_bounds[0]; - bounds.x = loc_bounds.x / 2; - bounds.y = loc_bounds.y / 2; - bounds.z = loc_bounds.z / 2; -} - - -template -__device__ __forceinline__ void -transform_sphere_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz - const float4 &sphere_pos, float4 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - const float3 p_arr = *(float3 *)&transform_mat[0]; - const scalar_t w = transform_mat[3]; - const scalar_t x = transform_mat[4]; - const scalar_t y = transform_mat[5]; - const scalar_t z = transform_mat[6]; - - if(x!= 0 || y != 0 || z!=0) +namespace Curobo +{ + namespace Geometry { + /** + * @brief Compute length of sphere + * + * @param v1 + * @param v2 + * @return float + */ + __device__ __forceinline__ float sphere_length(const float4& v1, + const float4& v2) + { + return norm3df(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z); + } - C.x = p_arr.x + w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - - 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + - 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - - z * z * sphere_pos.x - y * y * sphere_pos.x; - C.y = p_arr.y + 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + - 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - - z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - - x * x * sphere_pos.y; - C.z = p_arr.z + 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + - z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + - 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; - } - else - { - C.x = p_arr.x + sphere_pos.x; - C.y = p_arr.y + sphere_pos.y; - C.z = p_arr.z + sphere_pos.z; - } - C.w = sphere_pos.w; -} + __device__ __forceinline__ float sphere_distance(const float4& v1, + const float4& v2) + { + return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w); + } + template + __device__ __forceinline__ void load_obb_pose(const scalar_t *obb_mat, + float3& position, float4& quat) + { // obb_mat has x,y,z, qw, qx, qy, qz, 0 with an extra 0 padding for better use of memory + float4 temp = *(float4 *)&obb_mat[0]; -__device__ __forceinline__ void transform_sphere_quat(const float3 p, - const float4 q, - const float4 &sphere_pos, - float4 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p + position.x = temp.x; + position.y = temp.y; + position.z = temp.z; + quat.w = temp.w; + temp = 
*(float4 *)&obb_mat[4]; + quat.x = temp.x; + quat.y = temp.y; + quat.z = temp.z; + } - if(q.x!= 0 || q.y != 0 || q.z!=0) - { + template + __device__ __forceinline__ void load_obb_bounds(const scalar_t *obb_bounds, + float3 & bounds) + { // obb_bounds has x,y,z, 0 with an extra 0 padding. + float4 loc_bounds = *(float4 *)&obb_bounds[0]; - C.x = p.x + q.w * q.w * sphere_pos.x + 2 * q.y * q.w * sphere_pos.z - - 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + - 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - - q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; - C.y = p.y + 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + - 2 * q.z * q.y * sphere_pos.z + 2 * q.w * q.z * sphere_pos.x - - q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y - 2 * q.x * q.w * sphere_pos.z - - q.x * q.x * sphere_pos.y; - C.z = p.z + 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + - q.z * q.z * sphere_pos.z - 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z + - 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; - } - else - { - C.x = p.x + sphere_pos.x; - C.y = p.y + sphere_pos.y; - C.z = p.z + sphere_pos.z; - } - C.w = sphere_pos.w; -} + bounds.x = loc_bounds.x / 2; + bounds.y = loc_bounds.y / 2; + bounds.z = loc_bounds.z / 2; + } + template + __device__ __forceinline__ void + transform_sphere_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz + const float4& sphere_pos, float4& C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + const float3 p_arr = *(float3 *)&transform_mat[0]; + const scalar_t w = transform_mat[3]; + const scalar_t x = transform_mat[4]; + const scalar_t y = transform_mat[5]; + const scalar_t z = transform_mat[6]; -__device__ __forceinline__ void -inv_transform_vec_quat( - const float3 p, - const float4 q, - const float4 &sphere_pos, float3 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - if(q.x != 0 || q.y!= 0 || q.z!=0) - { - C.x = q.w * q.w * sphere_pos.x - 2 * q.y * q.w * sphere_pos.z + - 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + - 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - - q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; - C.y = 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + - 2 * q.z * q.y * sphere_pos.z - 2 * q.w * q.z * sphere_pos.x - - q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y + 2 * q.x * q.w * sphere_pos.z - - q.x * q.x * sphere_pos.y; - C.z = 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + - q.z * q.z * sphere_pos.z + 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z - - 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; - } - else - { - C.x = sphere_pos.x; - C.y = sphere_pos.y; - C.z = sphere_pos.z; - } -} + if ((x != 0) || (y != 0) || (z != 0)) + { + C.x = p_arr.x + w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - + 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + + 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - + z * z * sphere_pos.x - y * y * sphere_pos.x; + C.y = p_arr.y + 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + + 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - + z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - + x * x * sphere_pos.y; + C.z = p_arr.z + 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + + z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + + 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; + } + else + { + C.x = p_arr.x + sphere_pos.x; 
+ C.y = p_arr.y + sphere_pos.y; + C.z = p_arr.z + sphere_pos.z; + } + C.w = sphere_pos.w; + } + __device__ __forceinline__ void transform_sphere_quat(const float3 p, + const float4 q, + const float4& sphere_pos, + float4 & C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p -__device__ __forceinline__ void -inv_transform_vec_quat_add(const float3 p, - const float4 q, // x,y,z, qw, qx,qy,qz - const float4 &sphere_pos, float3 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - float3 temp_C = make_float3(0.0); - inv_transform_vec_quat(p, q, sphere_pos, temp_C); - C = C + temp_C; -} + if ((q.x != 0) || (q.y != 0) || (q.z != 0)) + { + C.x = p.x + q.w * q.w * sphere_pos.x + 2 * q.y * q.w * sphere_pos.z - + 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + + 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - + q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; + C.y = p.y + 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + + 2 * q.z * q.y * sphere_pos.z + 2 * q.w * q.z * sphere_pos.x - + q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y - 2 * q.x * q.w * sphere_pos.z - + q.x * q.x * sphere_pos.y; + C.z = p.z + 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + + q.z * q.z * sphere_pos.z - 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z + + 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; + } + else + { + C.x = p.x + sphere_pos.x; + C.y = p.y + sphere_pos.y; + C.z = p.z + sphere_pos.z; + } + C.w = sphere_pos.w; + } - -template -__device__ __forceinline__ void -inv_transform_vec_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz - const float4 &sphere_pos, float3 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - const scalar_t w = transform_mat[3]; - const scalar_t x = -1 * transform_mat[4]; - const scalar_t y = -1 * transform_mat[5]; - const scalar_t z = -1 * transform_mat[6]; - if(x != 0 || y!= 0 || z!=0) - { - - - C.x = w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - - 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + - 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - - z * z * sphere_pos.x - y * y * sphere_pos.x; - C.y = 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + - 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - - z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - - x * x * sphere_pos.y; - C.z = 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + - z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + - 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; - } - else - { - C.x = sphere_pos.x; - C.y = sphere_pos.y; - C.z = sphere_pos.z; - } -} - -template -__device__ __forceinline__ void -inv_transform_vec_quat_add(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz - const float4 &sphere_pos, float3 &C) { - // do dot product: - // new_p = q * p * q_inv + obs_p - const scalar_t w = transform_mat[3]; - const scalar_t x = -1 * transform_mat[4]; - const scalar_t y = -1 * transform_mat[5]; - const scalar_t z = -1 * transform_mat[6]; - if(x != 0 || y!= 0 || z!=0){ - - C.x += w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - - 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + - 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - - z * z * sphere_pos.x - y * y * sphere_pos.x; - C.y += 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + - 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - - z * z * sphere_pos.y + w * w * sphere_pos.y - - 2 * x * w * sphere_pos.z - x * x * sphere_pos.y; - 
C.z += 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + - z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - - y * y * sphere_pos.z + 2 * w * x * sphere_pos.y - - x * x * sphere_pos.z + w * w * sphere_pos.z; - } - { - C.x += sphere_pos.x; - C.y += sphere_pos.y; - C.z += sphere_pos.z; - } -} - -/** - * @brief Scales the Collision across the trajectory by sphere velocity. This is - * implemented from CHOMP motion planner (ICRA 2009). We use central difference - * to compute the velocity and acceleration of the sphere. - * - * @param sphere_0_cache - * @param sphere_1_cache - * @param sphere_2_cache - * @param dt - * @param transform_back - * @param max_dist - * @param max_grad - * @return void - */ -__device__ __forceinline__ void -scale_speed_metric(const float4 &sphere_0_cache, const float4 &sphere_1_cache, - const float4 &sphere_2_cache, const float &dt, - const bool &transform_back, float &max_dist, - float3 &max_grad) { - - float3 norm_vel_vec = make_float3(sphere_2_cache.x - sphere_0_cache.x, - sphere_2_cache.y - sphere_0_cache.y, - sphere_2_cache.z - sphere_0_cache.z); - - norm_vel_vec = (0.5 / dt) * norm_vel_vec; - const float sph_vel = length(norm_vel_vec); - - if (transform_back) { - - float3 sph_acc_vec = make_float3( - sphere_0_cache.x + sphere_2_cache.x - 2 * sphere_1_cache.x, - sphere_0_cache.y + sphere_2_cache.y - 2 * sphere_1_cache.y, - sphere_0_cache.z + sphere_2_cache.z - 2 * sphere_1_cache.z); - - sph_acc_vec = (1 / (dt * dt)) * sph_acc_vec; - norm_vel_vec = norm_vel_vec * (1 / sph_vel); - - const float3 curvature_vec = (sph_acc_vec) / (sph_vel * sph_vel); - - // compute orthogonal projection: - float orth_proj[9] = {0.0}; - - // load float3 into array for easier matmul later: - float vel_arr[3]; - vel_arr[0] = norm_vel_vec.x; - vel_arr[1] = norm_vel_vec.y; - vel_arr[2] = norm_vel_vec.z; - -// calculate projection ( I - (v * v^T)): -#pragma unroll 3 - for (int i = 0; i < 3; i++) { -#pragma unroll 3 - for (int j = 0; j < 3; j++) { - orth_proj[i * 3 + j] = -1 * vel_arr[i] * vel_arr[j]; + __device__ __forceinline__ void + inv_transform_vec_quat( + const float3 p, + const float4 q, + const float4& sphere_pos, float3& C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + if ((q.x != 0) || (q.y != 0) || (q.z != 0)) + { + C.x = q.w * q.w * sphere_pos.x - 2 * q.y * q.w * sphere_pos.z + + 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + + 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - + q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; + C.y = 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + + 2 * q.z * q.y * sphere_pos.z - 2 * q.w * q.z * sphere_pos.x - + q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y + 2 * q.x * q.w * + sphere_pos.z - + q.x * q.x * sphere_pos.y; + C.z = 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + + q.z * q.z * sphere_pos.z + 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z - + 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; + } + else + { + C.x = sphere_pos.x; + C.y = sphere_pos.y; + C.z = sphere_pos.z; } } - orth_proj[0] += 1; - orth_proj[4] += 1; - orth_proj[8] += 1; - // curvature vec: + __device__ __forceinline__ void + inv_transform_vec_quat_add(const float3 p, + const float4 q, // x,y,z, qw, qx,qy,qz + const float4& sphere_pos, float3& C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + float3 temp_C = make_float3(0.0); - // multiply by orth projection: - // two matmuls: - float orth_pt[3];; // orth_proj(3x3) * max_grad(3x1) - 
float orth_curve[3]; // max_dist(1) * orth_proj (3x3) * curvature_vec (3x1) + inv_transform_vec_quat(p, q, sphere_pos, temp_C); + C = C + temp_C; + } + + template + __device__ __forceinline__ void + inv_transform_vec_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz + const float4& sphere_pos, float3& C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + const scalar_t w = transform_mat[3]; + const scalar_t x = -1 * transform_mat[4]; + const scalar_t y = -1 * transform_mat[5]; + const scalar_t z = -1 * transform_mat[6]; + + if ((x != 0) || (y != 0) || (z != 0)) + { + C.x = w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - + 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + + 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - + z * z * sphere_pos.x - y * y * sphere_pos.x; + C.y = 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + + 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - + z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - + x * x * sphere_pos.y; + C.z = 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + + z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + + 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; + } + else + { + C.x = sphere_pos.x; + C.y = sphere_pos.y; + C.z = sphere_pos.z; + } + } + + template + __device__ __forceinline__ void + inv_transform_vec_quat_add(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz + const float4& sphere_pos, float3& C) + { + // do dot product: + // new_p = q * p * q_inv + obs_p + const scalar_t w = transform_mat[3]; + const scalar_t x = -1 * transform_mat[4]; + const scalar_t y = -1 * transform_mat[5]; + const scalar_t z = -1 * transform_mat[6]; + + if ((x != 0) || (y != 0) || (z != 0)) + { + C.x += w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - + 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + + 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - + z * z * sphere_pos.x - y * y * sphere_pos.x; + C.y += 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + + 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - + z * z * sphere_pos.y + w * w * sphere_pos.y - + 2 * x * w * sphere_pos.z - x * x * sphere_pos.y; + C.z += 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + + z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - + y * y * sphere_pos.z + 2 * w * x * sphere_pos.y - + x * x * sphere_pos.z + w * w * sphere_pos.z; + } + { + C.x += sphere_pos.x; + C.y += sphere_pos.y; + C.z += sphere_pos.z; + } + } + + /** + * @brief Scales the Collision across the trajectory by sphere velocity. This is + * implemented from CHOMP motion planner (ICRA 2009). We use central difference + * to compute the velocity and acceleration of the sphere. 
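+   *
+   * Concretely, with unit velocity vh = v / ||v||, projection P = I - vh * vh^T,
+   * and curvature k = P * a / ||v||^2, the metric rescales the cost c and its
+   * gradient g as c <- ||v|| * c and g <- ||v|| * (P * g - c * k); the
+   * orthogonal-projection matrix products in the body below implement these
+   * two terms.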
+ * + * @param sphere_0_cache + * @param sphere_1_cache + * @param sphere_2_cache + * @param dt + * @param transform_back + * @param max_dist + * @param max_grad + * @return void + */ + __device__ __forceinline__ void + scale_speed_metric(const float4& sphere_0_cache, const float4& sphere_1_cache, + const float4& sphere_2_cache, const float& dt, + const bool& transform_back, float& max_dist, + float3& max_grad) + { + float3 norm_vel_vec = make_float3(sphere_2_cache.x - sphere_0_cache.x, + sphere_2_cache.y - sphere_0_cache.y, + sphere_2_cache.z - sphere_0_cache.z); + + norm_vel_vec = (0.5 / dt) * norm_vel_vec; + const float sph_vel = length(norm_vel_vec); + + if (transform_back) + { + float3 sph_acc_vec = make_float3( + sphere_0_cache.x + sphere_2_cache.x - 2 * sphere_1_cache.x, + sphere_0_cache.y + sphere_2_cache.y - 2 * sphere_1_cache.y, + sphere_0_cache.z + sphere_2_cache.z - 2 * sphere_1_cache.z); + + sph_acc_vec = (1 / (dt * dt)) * sph_acc_vec; + norm_vel_vec = norm_vel_vec * (1 / sph_vel); + + const float3 curvature_vec = (sph_acc_vec) / (sph_vel * sph_vel); + + // compute orthogonal projection: + float orth_proj[9] = { 0.0 }; + + // load float3 into array for easier matmul later: + float vel_arr[3]; + vel_arr[0] = norm_vel_vec.x; + vel_arr[1] = norm_vel_vec.y; + vel_arr[2] = norm_vel_vec.z; + + // calculate projection ( I - (v * v^T)): +#pragma unroll 3 + + for (int i = 0; i < 3; i++) + { +#pragma unroll 3 + + for (int j = 0; j < 3; j++) + { + orth_proj[i * 3 + j] = -1 * vel_arr[i] * vel_arr[j]; + } + } + orth_proj[0] += 1; + orth_proj[4] += 1; + orth_proj[8] += 1; + + // curvature vec: + + // multiply by orth projection: + // two matmuls: + float orth_pt[3]; // orth_proj(3x3) * max_grad(3x1) + float orth_curve[3]; // max_dist(1) * orth_proj (3x3) * curvature_vec (3x1) #pragma unroll 3 - for (int i = 0; i < 3; i++) // matrix - vector product + + for (int i = 0; i < 3; i++) // matrix - vector product + { + orth_pt[i] = orth_proj[i * 3 + 0] * max_grad.x + + orth_proj[i * 3 + 1] * max_grad.y + + orth_proj[i * 3 + 2] * max_grad.z; + + orth_curve[i] = max_dist * (orth_proj[i * 3 + 0] * curvature_vec.x + + orth_proj[i * 3 + 1] * curvature_vec.y + + orth_proj[i * 3 + 2] * curvature_vec.z); + } + + // max_grad = sph_vel * ((orth_proj * max_grad) - max_dist * orth_proj * + // curvature) + + max_grad.x = sph_vel * (orth_pt[0] - orth_curve[0]); // orth_proj[0];// * (orth_pt[0] - + // orth_curve[0]); + max_grad.y = sph_vel * (orth_pt[1] - orth_curve[1]); + max_grad.z = sph_vel * (orth_pt[2] - orth_curve[2]); + } + max_dist = sph_vel * max_dist; + } + + // + + /** + * @brief check if sphere is inside. For numerical stability, we assume that if + * sphere is exactly at bound of cuboid, we are not in collision. Note: this is + * not warp safe. 
+ * + * @param bounds bounds of cuboid + * @param sphere sphere as float4 (in cuboid frame of reference) + * @return bool + */ + __device__ __forceinline__ bool + check_sphere_aabb(const float3 bounds, const float4 sphere) { - orth_pt[i] = orth_proj[i * 3 + 0] * max_grad.x + - orth_proj[i * 3 + 1] * max_grad.y + - orth_proj[i * 3 + 2] * max_grad.z; - - orth_curve[i] = max_dist * (orth_proj[i * 3 + 0] * curvature_vec.x + - orth_proj[i * 3 + 1] * curvature_vec.y + - orth_proj[i * 3 + 2] * curvature_vec.z); + // if((fabs(sphere.x) - bounds.x) >= sphere.w || fabs(sphere.y) - bounds.y >= + // sphere.w || (fabs(sphere.z) - bounds.z) >= sphere.w) + if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y), + fabs(sphere.z) - bounds.z) >= (sphere.w)) + { + return false; + } + return true; } - // max_grad = sph_vel * ((orth_proj * max_grad) - max_dist * orth_proj * - // curvature) - - max_grad.x =sph_vel * (orth_pt[0] - orth_curve[0]); ;//orth_proj[0];// * (orth_pt[0] - orth_curve[0]); - max_grad.y = sph_vel * (orth_pt[1] - orth_curve[1]); - max_grad.z = sph_vel * (orth_pt[2] - orth_curve[2]); - } - max_dist = sph_vel * max_dist; -} - -// -/** - * @brief check if sphere is inside. For numerical stability, we assume that if - * sphere is exactly at bound of cuboid, we are not in collision. Note: this is - * not warp safe. - * - * @param bounds bounds of cuboid - * @param sphere sphere as float4 (in cuboid frame of reference) - * @return bool - */ -__device__ __forceinline__ bool -check_sphere_aabb(const float3 bounds, const float4 sphere) { - // if((fabs(sphere.x) - bounds.x) >= sphere.w || fabs(sphere.y) - bounds.y >= - // sphere.w || (fabs(sphere.z) - bounds.z) >= sphere.w) - if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y), - fabs(sphere.z) - bounds.z) >= (sphere.w)) { - return false; - } - return true; -} - -__device__ __forceinline__ void -compute_closest_point(const float3 &bounds, const float4 &sphere, float4 &pt) { - // assuming the cuboid is centered at origin - // Find the closest face to the sphere position: - // If sphere is within bounds of obstacle, - float min_val, curr_val; - int min_idx; - - // We check the distance to each face and store the closest face - // All we want is the index of the closest face - min_val = bounds.x - fabsf(sphere.x); - - if (min_val < 0) // it's outside limits, clamp: - { - - pt.x = copysignf(bounds.x, sphere.x); - } - // check if bounds.x - sphere.x > bounds.x + sphere.x - min_val = fabsf(min_val); - - if (sphere.x > 0) { - min_idx = 0; - } else { - min_idx = 1; - } - - curr_val = bounds.y - fabsf(sphere.y); // check if sphere-y is outside y-lim - if (curr_val < 0) { // outside obb-y, we clamp point to bounds - pt.y = copysignf(bounds.y, sphere.y); - } - curr_val = fabsf(curr_val); - if (curr_val < min_val) { - min_val = curr_val; - if (sphere.y > 0) { - min_idx = 2; - } else { - min_idx = 3; - } - } - curr_val = bounds.z - fabsf(sphere.z); // distance to -ve bound - if (curr_val < 0) { - pt.y = copysignf(bounds.z, sphere.z); - } - curr_val = fabsf(curr_val); - if (curr_val < min_val) { - min_val = curr_val; - if (sphere.z > 0) { - min_idx = 4; - } else { - min_idx = 5; - } - } - - // we move pt's value to bounds on the closest face dimension: - switch (min_idx) { - case 0: - pt.x = bounds.x; - break; - case 1: - pt.x = -1 * bounds.x; - break; - case 2: - pt.y = bounds.y; - break; - case 3: - pt.y = -1 * bounds.y; - break; - case 4: - pt.z = bounds.z; - break; - case 5: - pt.z = -1 * bounds.z; - break; - default: - break; - } -} - 
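`compute_closest_point` above is the workhorse for both the distance query and the gradient direction. For a sphere center outside the box, the query reduces to clamping the center to the half-extents; a minimal sketch with a hypothetical helper name follows. The nearest-face bookkeeping in the kernel exists for the interior case, where plain clamping would return the center itself and yield no gradient. (Also worth noting: the z-face branch above clamps `pt.y` when z is out of bounds; by symmetry with the x and y branches this looks like it should be `pt.z`.)

// Sketch only, valid when the center c lies outside the box (hypothetical
// name, not the kernel's helper): clamping each coordinate of c to the
// half-extents yields the closest point on the cuboid surface.
__device__ float3 closest_point_aabb_outside(const float3 bounds, // half-extents
                                             const float3 c)      // sphere center
{
  return make_float3(fminf(fmaxf(c.x, -bounds.x), bounds.x),
                     fminf(fmaxf(c.y, -bounds.y), bounds.y),
                     fminf(fmaxf(c.z, -bounds.z), bounds.z));
}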
-__device__ __forceinline__ float -compute_distance(const float3 &bounds_w_radius, - const float4 &sphere) { // pass in cl_pt - // compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0); - compute_closest_point(bounds_w_radius, sphere, cl_pt); - // clamp: - - // compute distance: - return sphere_length(sphere, cl_pt); // cl_pt includes radius, and sphere also has radius -} - -__device__ __forceinline__ void scale_eta_metric(const float4 &sphere, const float4 &cl_pt, -const float eta, -float4 &sum_pt) -{ - // compute distance: - float sph_dist = 0; - sph_dist = sphere_length(sphere, cl_pt); - - if (sph_dist == 0) + __device__ __forceinline__ void + compute_closest_point(const float3& bounds, const float4& sphere, float4& pt) { - sum_pt.x = 0; - sum_pt.y = 0; - sum_pt.z = 0; - sum_pt.w = 0; - - return; + // assuming the cuboid is centered at origin + // Find the closest face to the sphere position: + // If sphere is within bounds of obstacle, + float min_val, curr_val; + int min_idx; + + // We check the distance to each face and store the closest face + // All we want is the index of the closest face + min_val = bounds.x - fabsf(sphere.x); + + if (min_val < 0) // it's outside limits, clamp: + { + pt.x = copysignf(bounds.x, sphere.x); + } + + // check if bounds.x - sphere.x > bounds.x + sphere.x + min_val = fabsf(min_val); + + if (sphere.x > 0) + { + min_idx = 0; + } + else + { + min_idx = 1; + } + + curr_val = bounds.y - fabsf(sphere.y); // check if sphere-y is outside y-lim + + if (curr_val < 0) // outside obb-y, we clamp point to bounds + { + pt.y = copysignf(bounds.y, sphere.y); + } + curr_val = fabsf(curr_val); + + if (curr_val < min_val) + { + min_val = curr_val; + + if (sphere.y > 0) + { + min_idx = 2; + } + else + { + min_idx = 3; + } + } + curr_val = bounds.z - fabsf(sphere.z); // distance to -ve bound + + if (curr_val < 0) + { + pt.y = copysignf(bounds.z, sphere.z); + } + curr_val = fabsf(curr_val); + + if (curr_val < min_val) + { + min_val = curr_val; + + if (sphere.z > 0) + { + min_idx = 4; + } + else + { + min_idx = 5; + } + } + + // we move pt's value to bounds on the closest face dimension: + switch (min_idx) + { + case 0: + pt.x = bounds.x; + break; + + case 1: + pt.x = -1 * bounds.x; + break; + + case 2: + pt.y = bounds.y; + break; + + case 3: + pt.y = -1 * bounds.y; + break; + + case 4: + pt.z = bounds.z; + break; + + case 5: + pt.z = -1 * bounds.z; + break; + + default: + break; + } } - sum_pt.x = (sphere.x - cl_pt.x) / sph_dist; - sum_pt.y = (sphere.y - cl_pt.y) / sph_dist; - sum_pt.z = (sphere.z - cl_pt.z) / sph_dist; - if (eta > 0.0) + + __device__ __forceinline__ float + compute_distance(const float3& bounds_w_radius, + const float4& sphere) // pass in cl_pt + {// compute closest point: + float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0); + + compute_closest_point(bounds_w_radius, sphere, cl_pt); + + // clamp: + + // compute distance: + return sphere_length(sphere, cl_pt); // cl_pt includes radius, and sphere also has radius + } + + __device__ __forceinline__ void scale_eta_metric(const float4& sphere, const float4& cl_pt, + const float eta, + float4& sum_pt) { + // compute distance: + float sph_dist = 0; - //sph_dist = sph_dist - eta; + sph_dist = sphere_length(sphere, cl_pt); - if (sph_dist > eta) { - sum_pt.w = sph_dist - 0.5 * eta; - } else if (sph_dist <= eta) { - - sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); - const float scale = (1 / eta) * (sph_dist); - sum_pt.x = scale * sum_pt.x; - sum_pt.y = scale * sum_pt.y; - 
sum_pt.z = scale * sum_pt.z; + if (sph_dist == 0) + { + sum_pt.x = 0; + sum_pt.y = 0; + sum_pt.z = 0; + sum_pt.w = 0; + + return; + } + sum_pt.x = (sphere.x - cl_pt.x) / sph_dist; + sum_pt.y = (sphere.y - cl_pt.y) / sph_dist; + sum_pt.z = (sphere.z - cl_pt.z) / sph_dist; + + if (eta > 0.0) + { + // sph_dist = sph_dist - eta; + + if (sph_dist > eta) + { + sum_pt.w = sph_dist - 0.5 * eta; + } + else if (sph_dist <= eta) + { + sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); + const float scale = (1 / eta) * (sph_dist); + sum_pt.x = scale * sum_pt.x; + sum_pt.y = scale * sum_pt.y; + sum_pt.z = scale * sum_pt.z; + } + } + else + { + sum_pt.w = sph_dist; + } } - } - else + /** + * @brief Compute distance and gradient, with a smooth l2 loss when distance >0 + * and < eta + * + * @param bounds_w_radius + * @param sphere + * @param sum_pt + * @param eta + * @return __device__ + */ + __device__ __forceinline__ void + compute_sphere_gradient(const float3& bounds_w_radius, const float4& sphere, + float4& sum_pt, const float eta = 0.0) { - sum_pt.w = sph_dist; - } -} - -/** - * @brief Compute distance and gradient, with a smooth l2 loss when distance >0 - * and < eta - * - * @param bounds_w_radius - * @param sphere - * @param sum_pt - * @param eta - * @return __device__ - */ -__device__ __forceinline__ void -compute_sphere_gradient(const float3 &bounds_w_radius, const float4 &sphere, - float4 &sum_pt, const float eta = 0.0) { - // compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0); - compute_closest_point(bounds_w_radius, sphere, cl_pt); - - scale_eta_metric(sphere, cl_pt, eta, sum_pt); -} - -__device__ __forceinline__ void -compute_sphere_gradient_add(const float3 &bounds_w_radius, const float4 &sphere, - float4 &sum_pt, const float k0 = 1.0, - const float eta = 0.0) { - // compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0); - compute_closest_point(bounds_w_radius, sphere, cl_pt); - // We need to check if sphere is colliding or outside - // if distance > 0.0, then it's colliding - // We can achieve this by adding eta to bounds_w_radius and then subtracting - // it - float4 local_sum_pt = make_float4(0.0,0.0,0.0,0.0); - scale_eta_metric(sphere, cl_pt, eta, local_sum_pt); - sum_pt += local_sum_pt; -} - -__device__ __forceinline__ void check_jump_distance( - const float4 &loc_sphere_1, const float4 loc_sphere_0, const float k0, - const float eta, const float3 &loc_bounds, const float3 &grad_loc_bounds, - float4 &sum_pt, - float &curr_jump_distance) // we can pass in interpolated sphere here, also - // need to pass cl_pt for use in - // compute_sphere_gradient & compute_distance -{ - const float4 interpolated_sphere = - (k0)*loc_sphere_1 + (1 - k0) * loc_sphere_0; - - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) { - - compute_sphere_gradient_add(grad_loc_bounds, interpolated_sphere, sum_pt, - k0, eta); // true, loc_sphere_1 works better - curr_jump_distance += loc_sphere_1.w; // move by diameter of sphere - } else { - const float dist = compute_distance(grad_loc_bounds, interpolated_sphere); - curr_jump_distance += max(dist - 2*loc_sphere_1.w, loc_sphere_1.w); - } -} - -/////////////////////////////////////////////////////////////// -// We write out the distance and gradients for all the spheres. -// So we do not need to initize the output tensor to 0. -// Each thread computes the max distance and gradients per sphere. 
-// This version should be faster if we have enough spheres/threads -// to fill the GPU as it avoid inter-thread communication and the -// use of shared memory. -/////////////////////////////////////////////////////////////// - -template -__device__ __forceinline__ void sphere_obb_collision_fn( - const scalar_t *sphere_position, - const int env_idx, - const int bn_sph_idx, - const int sph_idx, - scalar_t *out_distance, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *obb_accel, - const scalar_t *obb_bounds, const scalar_t *obb_mat, - const uint8_t *obb_enable, const int max_nobs, const int nboxes) { - float max_dist = 0; - const int start_box_idx = max_nobs * env_idx; - const float eta = activation_distance[0]; - // Load sphere_position input - float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; - if (sphere_cache.w <= 0.0) { - // write zeros for cost: - out_distance[bn_sph_idx] = 0; - - return; - } - sphere_cache.w += eta; - - float4 loc_sphere = make_float4(0.0); - float4 obb_quat = make_float4(0.0); - float3 obb_pos = make_float3(0.0); - float3 loc_bounds = make_float3(0.0); - for (int box_idx = 0; box_idx < nboxes; box_idx++) { - if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle - { - continue; - } - load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, - obb_quat); - load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); - - transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); - - - // first check if point is inside or outside box: - - if (check_sphere_aabb(loc_bounds, loc_sphere)) { - max_dist = 1; - break; // we exit without checking other cuboids if we found a collision. - } - } - out_distance[bn_sph_idx] = weight[0] * max_dist; -} -template -__device__ __forceinline__ void sphere_obb_distance_fn( - const scalar_t *sphere_position, - const int32_t env_idx, - const int bn_sph_idx, - const int sph_idx, - scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, - const int nboxes, const bool transform_back) { - float max_dist = 0.0; - const float eta = activation_distance[0]; - float3 max_grad = make_float3(0.0, 0.0, 0.0); - - // Load sphere_position input - float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; - if (sphere_cache.w <= 0.0) { - // write zeros for cost: - out_distance[bn_sph_idx] = 0; - - // write zeros for gradient if not zero: - if (sparsity_idx[bn_sph_idx] !=0) { - sparsity_idx[bn_sph_idx] = 0; - *(float4 *)& closest_pt[bn_sph_idx * 4] = make_float4(0.0); - } - return; - } - sphere_cache.w += eta; - const int start_box_idx = max_nobs * env_idx; - - float4 loc_sphere = make_float4(0.0); - float4 obb_quat = make_float4(0.0); - float3 obb_pos = make_float3(0.0); - float3 loc_bounds = make_float3(0.0); - - for (int box_idx = 0; box_idx < nboxes; box_idx++) { - if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle - { - continue; - } - - load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, - obb_quat); - load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); - - transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); - - // first check if point is inside or outside box: - if (check_sphere_aabb(loc_bounds, loc_sphere)) { // compute closest point: - loc_bounds = loc_bounds + loc_sphere.w; + float4 cl_pt = 
make_float4(sphere.x, sphere.y, sphere.z, 0.0); - // using same primitive functions: - float4 cl; - compute_sphere_gradient(loc_bounds, loc_sphere, cl, eta); + compute_closest_point(bounds_w_radius, sphere, cl_pt); - max_dist += cl.w; - if (transform_back) { - inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad); - - //inv_transform_vec_quat_add(&obb_mat[(start_box_idx + box_idx) * 7], cl, - // max_grad); - } + scale_eta_metric(sphere, cl_pt, eta, sum_pt); } - } - // sparsity opt: - if (max_dist == 0) { - if (sparsity_idx[bn_sph_idx] == 0) { - return; - } - sparsity_idx[bn_sph_idx] = 0; - if (transform_back) { - *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros - } - out_distance[bn_sph_idx] = 0.0; - return; - } - // else max_dist != 0 - max_dist = weight[0] * max_dist; - - if (transform_back) { - *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad; - } - out_distance[bn_sph_idx] = max_dist; - sparsity_idx[bn_sph_idx] = 1; -} - -template -__device__ __forceinline__ void swept_sphere_obb_distance_fn( - const scalar_t *sphere_position, - const int env_idx, const int b_idx, - const int h_idx, const int sph_idx, - scalar_t *out_distance, - scalar_t *closest_pt, - uint8_t *sparsity_idx, - const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, - const uint8_t *obb_enable, - const int max_nobs, - const int nboxes, const int batch_size, const int horizon, - const int nspheres, const int sweep_steps, - const bool transform_back) { - // create shared memory to do warp wide reductions: - // warp wide reductions should only happen across nspheres in same batch and horizon - // - // extern __shared__ float psum[]; - - const int sw_steps = sweep_steps; - const float fl_sw_steps = sw_steps; - const int start_box_idx = max_nobs * env_idx; - const int b_addrs = - b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; - - // We read the same obstacles across - - // Load sphere_position input - // if h_idx == horizon -1, we just read the same index - const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; - - float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx*4]; - - - - if (sphere_1_cache.w <= 0.0) { - // write zeros for cost: - out_distance[bhs_idx] = 0; - - // write zeros for gradient if not zero: - if (sparsity_idx[bhs_idx] !=0) { - sparsity_idx[b_addrs + h_idx * nspheres + sph_idx] = 0; - *(float4 *)& closest_pt[bhs_idx * 4] = make_float4(0.0); - - } - - return; - } - bool sweep_back = false; - bool sweep_fwd = false; - float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len; - - float max_dist = 0.0; - float3 max_grad = make_float3(0.0, 0.0, 0.0); - - const float dt = speed_dt[0]; - const float eta = activation_distance[0]; - sphere_1_cache.w += eta; - float4 sphere_0_cache, sphere_2_cache; - if (h_idx > 0) { - sphere_0_cache = *(float4 *)&sphere_position[b_addrs*4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4]; - sphere_0_cache.w = sphere_1_cache.w; - sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache); - sphere_0_len = sphere_0_distance + sphere_0_cache.w * 2; - if (sphere_0_distance > 0.0) { - sweep_back = true; - } - } - - if (h_idx < horizon - 1) { - sphere_2_cache = *(float4 *)&sphere_position[b_addrs*4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4]; - sphere_2_cache.w = sphere_1_cache.w; - sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache); - sphere_2_len = sphere_2_distance + 
sphere_2_cache.w * 2; - if (sphere_2_distance > 0.0) { - sweep_fwd = true; - } - } - float4 loc_sphere_0, loc_sphere_1, loc_sphere_2; - float k0 = 0.0; - - - //float4 loc_sphere = make_float4(0.0); - float4 obb_quat = make_float4(0.0); - float3 obb_pos = make_float3(0.0); - float3 loc_bounds = make_float3(0.0); - - for (int box_idx = 0; box_idx < nboxes; box_idx++) { - if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + __device__ __forceinline__ void + compute_sphere_gradient_add(const float3& bounds_w_radius, const float4& sphere, + float4& sum_pt, const float k0 = 1.0, + const float eta = 0.0) { - continue; + // compute closest point: + float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0); + + compute_closest_point(bounds_w_radius, sphere, cl_pt); + + // We need to check if sphere is colliding or outside + // if distance > 0.0, then it's colliding + // We can achieve this by adding eta to bounds_w_radius and then subtracting + // it + float4 local_sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); + scale_eta_metric(sphere, cl_pt, eta, local_sum_pt); + sum_pt += local_sum_pt; } - // read position and quaternion: - load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, - obb_quat); - load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); - float curr_jump_distance = 0.0; - - const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius doesn't change - - transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere_1); - - - - // assuming sphere position is in box frame: - // read data: - float4 sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); - - // check at exact timestep: - if (check_sphere_aabb(loc_bounds, loc_sphere_1)) { - compute_sphere_gradient(grad_loc_bounds, loc_sphere_1, sum_pt, eta); - curr_jump_distance = loc_sphere_1.w; // TODO: check if this is better - } else if (sweep_back || sweep_fwd) { - // there is no collision, compute the distance to obstacle: - curr_jump_distance = compute_distance(grad_loc_bounds, loc_sphere_1) - - loc_sphere_1.w;// - eta; - } - const float jump_mid_distance = curr_jump_distance; - // compute distance between sweep spheres: - if (sweep_back && jump_mid_distance < sphere_0_distance / 2) { - transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); - - // get unit vector: - // loc_sphere_0 = (loc_sphere_0 - loc_sphere_1)/(sphere_0_len); - - // loop over sweep steps and accumulate distance: - for (int j = 0; j < sw_steps; j++) { - // jump by current jump distance: - - // when sweep_steps == 0, then we only check at loc_sphere_1. 
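// The sweep loop here marches an interpolated sphere along the segment between
// the two timesteps (a conservative-advancement scheme): on contact it
// accumulates the gradient and steps by roughly one sphere width, while in
// free space it steps by the sphere's clearance to the box (less a diameter,
// floored at one sphere width), so an obstacle thinner than the sphere cannot
// be jumped over.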
- // do interpolation from t=1 to t=0 (sweep backward) - - if (curr_jump_distance >= (sphere_0_len / 2)) { - break; - } - k0 = 1 - (curr_jump_distance / sphere_0_len); - check_jump_distance(loc_sphere_1, loc_sphere_0, k0, eta, loc_bounds, - grad_loc_bounds, sum_pt, curr_jump_distance); - } - } - if (sweep_fwd && jump_mid_distance < (sphere_2_len / 2)) { - curr_jump_distance = jump_mid_distance; - transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); - - - for (int j = 0; j < sw_steps; j++) { - - if (curr_jump_distance >= (sphere_2_len / 2)) { - break; - } - k0 = 1 - curr_jump_distance / sphere_2_len; - check_jump_distance(loc_sphere_1, loc_sphere_2, k0, eta, loc_bounds, - grad_loc_bounds, sum_pt, curr_jump_distance); - } - } - if (sum_pt.w > 0) { - max_dist += sum_pt.w; - - // transform point back if required: - if (transform_back) { - inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad); - } - // break;// break after first obstacle collision - } - } - - - // sparsity opt: - if (max_dist == 0) { - if (sparsity_idx[bhs_idx] == 0) { - return; - } - sparsity_idx[bhs_idx] = 0; - if (transform_back) { - *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros - } - out_distance[bhs_idx] = 0.0; - return; - } - - // computer speed metric here: - if (enable_speed_metric) - { - if (sweep_back && sweep_fwd) { - scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt, - transform_back, max_dist, max_grad); - } - } - max_dist = weight[0] * max_dist; - - if (transform_back) { - *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad; - } - sparsity_idx[bhs_idx] = 1; - - out_distance[bhs_idx] = max_dist; -} - -/** - * @brief Swept Collision checking. Note: This function currently does not - * implement skipping computation based on distance (which is done in - * swept_sphere_obb_distance_fn). 
- * - * @tparam scalar_t - * @param sphere_position - * @param env_idx - * @param b_idx - * @param h_idx - * @param sph_idx - * @param out_distance - * @param weight - * @param activation_distance - * @param obb_accel - * @param obb_bounds - * @param obb_mat - * @param obb_enable - * @param max_nobs - * @param nboxes - * @param batch_size - * @param horizon - * @param nspheres - * @param sweep_steps - * @return __device__ - */ -template -__device__ __forceinline__ void swept_sphere_obb_collision_fn( - const scalar_t *sphere_position, - const int env_idx, const int b_idx, - const int h_idx, const int sph_idx, scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, - const int nboxes, const int batch_size, const int horizon, - const int nspheres, const int sweep_steps) { - - const int sw_steps = sweep_steps; - const float fl_sw_steps = 2 * sw_steps + 1; - float max_dist = 0.0; - const float eta = activation_distance[0]; - const int b_addrs = - b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; - const int start_box_idx = max_nobs * env_idx; - const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; - - // We read the same obstacles across - - // Load sphere_position input - // if h_idx == horizon -1, we just read the same index - float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx*4]; - if (sphere_1_cache.w <= 0.0) { - out_distance[b_addrs + h_idx * nspheres + sph_idx] = 0.0; - return; - } - sphere_1_cache.w += eta; - - float4 sphere_0_cache, sphere_2_cache; - if (h_idx > 0) { - sphere_0_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + - sph_idx * 4]; - sphere_0_cache.w += eta; - } - - if (h_idx < horizon - 1) { - sphere_2_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + - sph_idx * 4]; - sphere_2_cache.w += eta; - } - float4 loc_sphere_0, loc_sphere_1, loc_sphere_2; - float4 interpolated_sphere; - float k0, k1; - float in_obb_mat[7]; - for (int box_idx = 0; box_idx < nboxes; box_idx++) { - // read position and quaternion: - if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + __device__ __forceinline__ void check_jump_distance( + const float4& loc_sphere_1, const float4 loc_sphere_0, const float k0, + const float eta, const float3& loc_bounds, const float3& grad_loc_bounds, + float4& sum_pt, + float& curr_jump_distance) // we can pass in interpolated sphere here, also + // need to pass cl_pt for use in + // compute_sphere_gradient & compute_distance { - continue; + const float4 interpolated_sphere = + (k0) * loc_sphere_1 + (1 - k0) * loc_sphere_0; + + if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + { + compute_sphere_gradient_add(grad_loc_bounds, interpolated_sphere, sum_pt, + k0, eta); // true, loc_sphere_1 works better + curr_jump_distance += loc_sphere_1.w; // move by diameter of sphere + } + else + { + const float dist = compute_distance(grad_loc_bounds, interpolated_sphere); + curr_jump_distance += max(dist - 2 * loc_sphere_1.w, loc_sphere_1.w); + } } + /////////////////////////////////////////////////////////////// + // We write out the distance and gradients for all the spheres. + // So we do not need to initize the output tensor to 0. + // Each thread computes the max distance and gradients per sphere. 
+ // This version should be faster if we have enough spheres/threads + // to fill the GPU as it avoid inter-thread communication and the + // use of shared memory. + /////////////////////////////////////////////////////////////// + + template + __device__ __forceinline__ void sphere_obb_collision_fn( + const scalar_t *sphere_position, + const int env_idx, + const int bn_sph_idx, + const int sph_idx, + scalar_t *out_distance, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *obb_accel, + const scalar_t *obb_bounds, const scalar_t *obb_mat, + const uint8_t *obb_enable, const int max_nobs, const int nboxes) + { + float max_dist = 0; + const int start_box_idx = max_nobs * env_idx; + const float eta = activation_distance[0]; + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + + if (sphere_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + return; + } + sphere_cache.w += eta; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) + { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + + + // first check if point is inside or outside box: + + if (check_sphere_aabb(loc_bounds, loc_sphere)) + { + max_dist = 1; + break; // we exit without checking other cuboids if we found a collision. + } + } + out_distance[bn_sph_idx] = weight[0] * max_dist; + } + + template + __device__ __forceinline__ void sphere_obb_distance_fn( + const scalar_t *sphere_position, + const int32_t env_idx, + const int bn_sph_idx, + const int sph_idx, + scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, + const int nboxes, const bool transform_back) + { + float max_dist = 0.0; + const float eta = activation_distance[0]; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + + if (sphere_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bn_sph_idx] != 0) + { + sparsity_idx[bn_sph_idx] = 0; + *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0); + } + return; + } + sphere_cache.w += eta; + const int start_box_idx = max_nobs * env_idx; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) + { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + + load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + + // first check if point is inside or outside box: + if (check_sphere_aabb(loc_bounds, loc_sphere)) + { + // compute closest point: + loc_bounds = 
loc_bounds + loc_sphere.w; + + // using same primitive functions: + float4 cl; + compute_sphere_gradient(loc_bounds, loc_sphere, cl, eta); + + max_dist += cl.w; + + if (transform_back) + { + inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad); + + // inv_transform_vec_quat_add(&obb_mat[(start_box_idx + box_idx) * 7], cl, + // max_grad); + } + } + } + + // sparsity opt: + if (max_dist == 0) + { + if (sparsity_idx[bn_sph_idx] == 0) + { + return; + } + sparsity_idx[bn_sph_idx] = 0; + + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros + } + out_distance[bn_sph_idx] = 0.0; + return; + } + + // else max_dist != 0 + max_dist = weight[0] * max_dist; + + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad; + } + out_distance[bn_sph_idx] = max_dist; + sparsity_idx[bn_sph_idx] = 1; + } + + template + __device__ __forceinline__ void swept_sphere_obb_distance_fn( + const scalar_t *sphere_position, + const int env_idx, const int b_idx, + const int h_idx, const int sph_idx, + scalar_t *out_distance, + scalar_t *closest_pt, + uint8_t *sparsity_idx, + const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, + const uint8_t *obb_enable, + const int max_nobs, + const int nboxes, const int batch_size, const int horizon, + const int nspheres, const int sweep_steps, + const bool transform_back) + { + // create shared memory to do warp wide reductions: + // warp wide reductions should only happen across nspheres in same batch and horizon + // + // extern __shared__ float psum[]; + + const int sw_steps = sweep_steps; + const float fl_sw_steps = sw_steps; + const int start_box_idx = max_nobs * env_idx; + const int b_addrs = + b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; + + // We read the same obstacles across + + // Load sphere_position input + // if h_idx == horizon -1, we just read the same index + const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; + + float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4]; + + + if (sphere_1_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bhs_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bhs_idx] != 0) + { + sparsity_idx[b_addrs + h_idx * nspheres + sph_idx] = 0; + *(float4 *)&closest_pt[bhs_idx * 4] = make_float4(0.0); + } + + return; + } + bool sweep_back = false; + bool sweep_fwd = false; + float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len; + + float max_dist = 0.0; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + const float dt = speed_dt[0]; + const float eta = activation_distance[0]; + sphere_1_cache.w += eta; + float4 sphere_0_cache, sphere_2_cache; + + if (h_idx > 0) + { + sphere_0_cache = + *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4]; + sphere_0_cache.w = sphere_1_cache.w; + sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache); + sphere_0_len = sphere_0_distance + sphere_0_cache.w * 2; + + if (sphere_0_distance > 0.0) + { + sweep_back = true; + } + } + + if (h_idx < horizon - 1) + { + sphere_2_cache = + *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4]; + sphere_2_cache.w = sphere_1_cache.w; + sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache); + sphere_2_len = sphere_2_distance + sphere_2_cache.w * 2; + + if (sphere_2_distance > 0.0) + { + sweep_fwd = true; + 
} + } + float4 loc_sphere_0, loc_sphere_1, loc_sphere_2; + float k0 = 0.0; + + + // float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) + { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + + // read position and quaternion: + load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); + float curr_jump_distance = 0.0; + + const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius + // doesn't change + + transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere_1); + + + // assuming sphere position is in box frame: + // read data: + float4 sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); + + // check at exact timestep: + if (check_sphere_aabb(loc_bounds, loc_sphere_1)) + { + compute_sphere_gradient(grad_loc_bounds, loc_sphere_1, sum_pt, eta); + curr_jump_distance = loc_sphere_1.w; // TODO: check if this is better + } + else if (sweep_back || sweep_fwd) + { + // there is no collision, compute the distance to obstacle: + curr_jump_distance = compute_distance(grad_loc_bounds, loc_sphere_1) - + loc_sphere_1.w; // - eta; + } + const float jump_mid_distance = curr_jump_distance; + + // compute distance between sweep spheres: + if (sweep_back && (jump_mid_distance < sphere_0_distance / 2)) + { + transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); + + // get unit vector: + // loc_sphere_0 = (loc_sphere_0 - loc_sphere_1)/(sphere_0_len); + + // loop over sweep steps and accumulate distance: + for (int j = 0; j < sw_steps; j++) + { + // jump by current jump distance: + + // when sweep_steps == 0, then we only check at loc_sphere_1. 
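+          // Each iteration advances curr_jump_distance along the segment and
+          // the loop exits at the segment midpoint (sphere_0_len / 2); the
+          // other half is covered by the forward sweep of the thread that
+          // handles the previous timestep.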
+          // do interpolation from t=1 to t=0 (sweep backward)
+
+          if (curr_jump_distance >= (sphere_0_len / 2))
+          {
+            break;
+          }
+          k0 = 1 - (curr_jump_distance / sphere_0_len);
+          check_jump_distance(loc_sphere_1, loc_sphere_0, k0, eta, loc_bounds,
+                              grad_loc_bounds, sum_pt, curr_jump_distance);
+        }
+      }
+
+      if (sweep_fwd && (jump_mid_distance < (sphere_2_len / 2)))
+      {
+        curr_jump_distance = jump_mid_distance;
+        transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2);
+
+
+        for (int j = 0; j < sw_steps; j++)
+        {
+          if (curr_jump_distance >= (sphere_2_len / 2))
+          {
+            break;
+          }
+          k0 = 1 - curr_jump_distance / sphere_2_len;
+          check_jump_distance(loc_sphere_1, loc_sphere_2, k0, eta, loc_bounds,
+                              grad_loc_bounds, sum_pt, curr_jump_distance);
+        }
+      }
+
+      if (sum_pt.w > 0)
+      {
+        max_dist += sum_pt.w;
+
+        // transform point back if required:
+        if (transform_back)
+        {
+          inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad);
+        }
+
+        // break; // break after first obstacle collision
+      }
+    }
+
+
+    // sparsity opt:
+    if (max_dist == 0)
+    {
+      if (sparsity_idx[bhs_idx] == 0)
+      {
+        return;
+      }
+      sparsity_idx[bhs_idx] = 0;
+
+      if (transform_back)
+      {
+        *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros
+      }
+      out_distance[bhs_idx] = 0.0;
+      return;
+    }
+
+    // compute speed metric here:
+    if (enable_speed_metric)
+    {
+      if (sweep_back && sweep_fwd)
+      {
+        scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt,
+                           transform_back, max_dist, max_grad);
+      }
+    }
+    max_dist = weight[0] * max_dist;
+
+    if (transform_back)
+    {
+      *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad;
+    }
+    sparsity_idx[bhs_idx] = 1;
+
+    out_distance[bhs_idx] = max_dist;
+  }
+
+  /**
+   * @brief Swept Collision checking. Note: This function currently does not
+   * implement skipping computation based on distance (which is done in
+   * swept_sphere_obb_distance_fn).
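+   * Instead, it samples sweep_steps interpolated spheres toward each
+   * neighboring timestep (2 * sweep_steps + 1 checks including the exact
+   * timestep) and writes weight[0] * 1 on any hit, i.e. a binary collision
+   * flag rather than a distance.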
+ * + * @tparam scalar_t + * @param sphere_position + * @param env_idx + * @param b_idx + * @param h_idx + * @param sph_idx + * @param out_distance + * @param weight + * @param activation_distance + * @param obb_accel + * @param obb_bounds + * @param obb_mat + * @param obb_enable + * @param max_nobs + * @param nboxes + * @param batch_size + * @param horizon + * @param nspheres + * @param sweep_steps + * @return __device__ + */ + template + __device__ __forceinline__ void swept_sphere_obb_collision_fn( + const scalar_t *sphere_position, + const int env_idx, const int b_idx, + const int h_idx, const int sph_idx, scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, + const int nboxes, const int batch_size, const int horizon, + const int nspheres, const int sweep_steps) + { + const int sw_steps = sweep_steps; + const float fl_sw_steps = 2 * sw_steps + 1; + float max_dist = 0.0; + const float eta = activation_distance[0]; + const int b_addrs = + b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; + const int start_box_idx = max_nobs * env_idx; + const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; + + // We read the same obstacles across + + // Load sphere_position input + // if h_idx == horizon -1, we just read the same index + float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4]; + + if (sphere_1_cache.w <= 0.0) + { + out_distance[b_addrs + h_idx * nspheres + sph_idx] = 0.0; + return; + } + sphere_1_cache.w += eta; + + float4 sphere_0_cache, sphere_2_cache; + + if (h_idx > 0) + { + sphere_0_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + + sph_idx * 4]; + sphere_0_cache.w += eta; + } + + if (h_idx < horizon - 1) + { + sphere_2_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + + sph_idx * 4]; + sphere_2_cache.w += eta; + } + float4 loc_sphere_0, loc_sphere_1, loc_sphere_2; + float4 interpolated_sphere; + float k0, k1; + float in_obb_mat[7]; + + for (int box_idx = 0; box_idx < nboxes; box_idx++) + { + // read position and quaternion: + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + #pragma unroll - for (int i = 0; i < 7; i++) { - in_obb_mat[i] = obb_mat[(start_box_idx + box_idx) * 7 + i]; - } - float3 loc_bounds = - *(float3 *)&obb_bounds[(start_box_idx + box_idx) * 3]; // /2 - loc_bounds = loc_bounds / 2; + for (int i = 0; i < 7; i++) + { + in_obb_mat[i] = obb_mat[(start_box_idx + box_idx) * 7 + i]; + } - transform_sphere_quat(&in_obb_mat[0], sphere_1_cache, loc_sphere_1); + float3 loc_bounds = + *(float3 *)&obb_bounds[(start_box_idx + box_idx) * 3]; // /2 + loc_bounds = loc_bounds / 2; - max_dist += box_idx; - if (check_sphere_aabb(loc_bounds, loc_sphere_1)) { - max_dist = 1; - } - if (h_idx > 0) { + transform_sphere_quat(&in_obb_mat[0], sphere_1_cache, loc_sphere_1); - transform_sphere_quat(&in_obb_mat[0], sphere_0_cache, loc_sphere_0); + max_dist += box_idx; - // loop over sweep steps and accumulate distance: - for (int j = 0; j < sw_steps; j++) { - // when sweep_steps == 0, then we only check at loc_sphere_1. 
- // do interpolation from t=1 to t=0 (sweep backward) - k0 = (j + 1) / (fl_sw_steps); - k1 = 1 - k0; - interpolated_sphere = k0 * loc_sphere_1 + (k1)*loc_sphere_0; - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) { + if (check_sphere_aabb(loc_bounds, loc_sphere_1)) + { max_dist = 1; + } + + if (h_idx > 0) + { + transform_sphere_quat(&in_obb_mat[0], sphere_0_cache, loc_sphere_0); + + // loop over sweep steps and accumulate distance: + for (int j = 0; j < sw_steps; j++) + { + // when sweep_steps == 0, then we only check at loc_sphere_1. + // do interpolation from t=1 to t=0 (sweep backward) + k0 = (j + 1) / (fl_sw_steps); + k1 = 1 - k0; + interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_0; + + if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + { + max_dist = 1; + break; + } + } + } + + if (h_idx < horizon - 1) + { + transform_sphere_quat(&in_obb_mat[0], sphere_2_cache, loc_sphere_2); + + for (int j = 0; j < sw_steps; j++) + { + // do interpolation from t=1 to t=2 (sweep forward): + + k0 = (j + 1) / (fl_sw_steps); + k1 = 1 - k0; + interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_2; + + if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + { + max_dist = 1; + break; + } + } + } + + if (max_dist > 0) + { break; } } + out_distance[b_addrs + h_idx * nspheres + sph_idx] = weight[0] * max_dist; } - if (h_idx < horizon - 1) { + template + __global__ void sphere_obb_distance_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *obb_accel, + const scalar_t *obb_bounds, const scalar_t *obb_mat, + const uint8_t *obb_enable, const int32_t *n_env_obb, + const int32_t *env_query_idx, + const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const bool transform_back) + { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - transform_sphere_quat(&in_obb_mat[0], sphere_2_cache, loc_sphere_2); - - for (int j = 0; j < sw_steps; j++) { - // do interpolation from t=1 to t=2 (sweep forward): - - k0 = (j + 1) / (fl_sw_steps); - k1 = 1 - k0; - interpolated_sphere = k0 * loc_sphere_1 + (k1)*loc_sphere_2; - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) { - max_dist = 1; - break; - } + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; } - } - if (max_dist > 0) { - break; - } - } - out_distance[b_addrs + h_idx * nspheres + sph_idx] = weight[0] * max_dist; -} - -template -__global__ void sphere_obb_distance_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *obb_accel, - const scalar_t *obb_bounds, const scalar_t *obb_mat, - const uint8_t *obb_enable, const int32_t *n_env_obb, - const int32_t *env_query_idx, - const int max_nobs, - const int batch_size, const int horizon, const int nspheres, - const bool transform_back) { - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing 
threadidx/nboxes - // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } - const int bn_sph_idx = + const int bn_sph_idx = b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; - int env_idx=0; - int env_nboxes=n_env_obb[0]; - if(batch_env_t) - { - - env_idx = - env_query_idx[b_idx]; // read env idx from current batch idx - env_nboxes = - n_env_obb[env_idx]; // read nboxes in current environment - } - - sphere_obb_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, - closest_pt, sparsity_idx, weight, activation_distance, - obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs, - env_nboxes, transform_back); - // return the sphere distance here: - // sync threads and do block level reduction: + int env_idx = 0; + int env_nboxes = n_env_obb[0]; + + if (batch_env_t) + { + env_idx = + env_query_idx[b_idx]; // read env idx from current batch idx + env_nboxes = + n_env_obb[env_idx]; // read nboxes in current environment + } + + sphere_obb_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + closest_pt, sparsity_idx, weight, activation_distance, + obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs, + env_nboxes, transform_back); + + // return the sphere distance here: + // sync threads and do block level reduction: + } + + template + __global__ void swept_sphere_obb_distance_jump_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres, const int sweep_steps, const bool transform_back) + { + // This kernel jumps by sdf to only get gradients at collision points. 
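+    // "Jumping" is a sphere-marching scheme: check_jump_distance advances
+    // along the swept segment by the current clearance to the box (a
+    // conservative bound on free space), so collision-free stretches are
+    // skipped instead of being sampled uniformly.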
+ + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } + + const int env_idx = 0; -} + const int env_nboxes = n_env_obb[env_idx]; + swept_sphere_obb_distance_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, + sparsity_idx, weight, activation_distance, speed_dt, obb_accel, + obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, + horizon, nspheres, sweep_steps, transform_back); + } -template -__global__ void swept_sphere_obb_distance_jump_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int max_nobs, const int batch_size, - const int horizon, const int nspheres, const int sweep_steps, const bool transform_back) { + template + __global__ void swept_sphere_obb_distance_jump_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const int sweep_steps, + const bool transform_back) + { + // This kernel jumps by sdf to only get gradients at collision points. - // This kernel jumps by sdf to only get gradients at collision points. 
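+    // Index decomposition is sphere-major within a timestep, matching the
+    // [batch, horizon, nspheres, 4] layout of sphere_position; the commented
+    // lines below are the transposed, horizon-major alternative.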
+ // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } + // const int sph_idx = (t_idx - b_idx * (horizon * nspheres)) / horizon; + // const int h_idx = (t_idx - b_idx * horizon * nspheres - sph_idx * horizon); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - const int env_idx = 0; + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } - - const int env_nboxes = n_env_obb[env_idx]; - swept_sphere_obb_distance_fn( - sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, - sparsity_idx, weight, activation_distance, speed_dt, obb_accel, - obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, - horizon, nspheres, sweep_steps, transform_back); -} + const int env_idx = env_query_idx[b_idx]; + const int env_nboxes = n_env_obb[env_idx]; -template -__global__ void swept_sphere_obb_distance_jump_batch_env_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, - const int batch_size, const int horizon, const int nspheres, - const int sweep_steps, - const bool transform_back) { - // This kernel jumps by sdf to only get gradients at collision points. 
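+    // Per-problem environments: env_query_idx maps each batch index to an
+    // environment and n_env_obb gives that environment's box count, so one
+    // launch can check different obstacle worlds in parallel.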
+ swept_sphere_obb_distance_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, + sparsity_idx, weight, activation_distance, speed_dt, obb_accel, + obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, + horizon, nspheres, sweep_steps, transform_back); + } - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - //const int sph_idx = (t_idx - b_idx * (horizon * nspheres)) / horizon; - //const int h_idx = (t_idx - b_idx * horizon * nspheres - sph_idx * horizon); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } + template + __global__ void swept_sphere_obb_collision_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres, const int sweep_steps) + { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - const int env_idx = env_query_idx[b_idx]; - const int env_nboxes = n_env_obb[env_idx]; - - swept_sphere_obb_distance_fn( - sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, - sparsity_idx, weight, activation_distance, speed_dt, obb_accel, - obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, - horizon, nspheres, sweep_steps, transform_back); -} -template -__global__ void swept_sphere_obb_collision_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int max_nobs, const int batch_size, - const int horizon, const int nspheres, const int sweep_steps) { + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } + const int env_idx = 0; + const int env_nboxes = n_env_obb[env_idx]; - const int env_idx = 0; - const int env_nboxes = n_env_obb[env_idx]; + swept_sphere_obb_collision_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight, + activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); + } - swept_sphere_obb_collision_fn( - 
sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight, - activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, - max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); -} + template + __global__ void swept_sphere_obb_collision_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const int sweep_steps) + { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); -template -__global__ void swept_sphere_obb_collision_batch_env_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, - const int batch_size, const int horizon, const int nspheres, - const int sweep_steps) { + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } + const int env_idx = env_query_idx[b_idx]; + const int env_nboxes = n_env_obb[env_idx]; - const int env_idx = env_query_idx[b_idx]; - const int env_nboxes = n_env_obb[env_idx]; + swept_sphere_obb_collision_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight, + activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); + } - swept_sphere_obb_collision_fn( - sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight, - activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, - max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); -} + template + __global__ void sphere_obb_collision_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres) + { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - 
h_idx * nspheres); -template -__global__ void sphere_obb_collision_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int max_nobs, const int batch_size, - const int horizon, const int nspheres) { - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } - const int env_idx = 0; - const int env_nboxes = n_env_obb[env_idx]; - const int bn_sph_idx = - b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; - sphere_obb_collision_fn(sphere_position, - env_idx, bn_sph_idx,sph_idx, out_distance, - weight, activation_distance, obb_accel, obb_bounds, - obb_mat, obb_enable, max_nobs, env_nboxes); -} + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } + const int env_idx = 0; + const int env_nboxes = n_env_obb[env_idx]; + const int bn_sph_idx = + b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; + sphere_obb_collision_fn(sphere_position, + env_idx, bn_sph_idx, sph_idx, out_distance, + weight, activation_distance, obb_accel, obb_bounds, + obb_mat, obb_enable, max_nobs, env_nboxes); + } -template -__global__ void sphere_obb_collision_batch_env_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, - const int batch_size, const int horizon, const int nspheres) { - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { - return; - } - const int env_idx = env_query_idx[b_idx]; - const int env_nboxes = n_env_obb[env_idx]; - const int bn_sph_idx = - b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; - sphere_obb_collision_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, - weight, activation_distance, obb_accel, obb_bounds, - obb_mat, obb_enable, max_nobs, env_nboxes); -} -} // namespace Geometry -} // namespace Curobo + template + __global__ void sphere_obb_collision_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres) + { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing 
threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; + } + const int env_idx = env_query_idx[b_idx]; + const int env_nboxes = n_env_obb[env_idx]; + const int bn_sph_idx = + b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; + sphere_obb_collision_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + weight, activation_distance, obb_accel, obb_bounds, + obb_mat, obb_enable, max_nobs, env_nboxes); + } + } // namespace Geometry +} // namespace Curobo std::vector sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3 torch::Tensor distance, - torch::Tensor closest_point, // batch size, 3 + torch::Tensor closest_point, // batch size, 3 torch::Tensor sparsity_idx, const torch::Tensor weight, const torch::Tensor activation_distance, - const torch::Tensor obb_accel, // n_boxes, 4, 4 - const torch::Tensor obb_bounds, // n_boxes, 3 - const torch::Tensor obb_pose, // n_boxes, 4, 4 - const torch::Tensor obb_enable, // n_boxes, 4, 4 - const torch::Tensor n_env_obb, // n_boxes, 4, 4 - const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const torch::Tensor obb_accel, // n_boxes, 4, 4 + const torch::Tensor obb_bounds, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_obb, // n_boxes, 4, 4 + const torch::Tensor env_query_idx, // n_boxes, 4, 4 const int max_nobs, const int batch_size, const int horizon, const int n_spheres, const bool transform_back, - const bool compute_distance, const bool use_batch_env) { - + const bool compute_distance, const bool use_batch_env) +{ using namespace Curobo::Geometry; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - const int bnh_spheres = n_spheres * batch_size * horizon; // + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int bnh_spheres = n_spheres * batch_size * horizon; // int threadsPerBlock = bnh_spheres; - if (threadsPerBlock > 128) { - threadsPerBlock = 128; + + if (threadsPerBlock > 128) + { + threadsPerBlock = 128; } int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; - if (use_batch_env) { - - if (compute_distance) { + if (use_batch_env) + { + if (compute_distance) + { // TODO: call kernel based on flag: AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_clpt", ([&] { - sphere_obb_distance_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, transform_back); - })); - } else { - - // TODO: call kernel based on flag: - AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - sphere_obb_collision_batch_env_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres); - })); + 
distance.scalar_type(), "SphereObb_clpt", ([&] { + sphere_obb_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, transform_back); + })); } - - } else { - - if (compute_distance) { - + else + { // TODO: call kernel based on flag: AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_clpt", ([&] { - sphere_obb_distance_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, transform_back); - })); - } else { - + distance.scalar_type(), "SphereObb_collision", ([&] { + sphere_obb_collision_batch_env_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres); + })); + } + } + else + { + if (compute_distance) + { // TODO: call kernel based on flag: AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - sphere_obb_collision_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, - horizon, n_spheres); - })); + distance.scalar_type(), "SphereObb_clpt", ([&] { + sphere_obb_distance_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, transform_back); + })); + } + else + { + // TODO: call kernel based on flag: + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "SphereObb_collision", ([&] { + sphere_obb_collision_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), max_nobs, batch_size, + horizon, n_spheres); + })); } } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {distance, closest_point, sparsity_idx}; + return { distance, closest_point, sparsity_idx }; } -std::vector swept_sphere_obb_clpt( - const torch::Tensor sphere_position, // batch_size, 3 +std::vectorswept_sphere_obb_clpt( + const torch::Tensor sphere_position, // batch_size, 3 - torch::Tensor distance, // batch_size, 1 - torch::Tensor - closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient - torch::Tensor 
sparsity_idx, const torch::Tensor weight, - const torch::Tensor activation_distance, const torch::Tensor speed_dt, - const torch::Tensor obb_accel, // n_boxes, 4, 4 - const torch::Tensor obb_bounds, // n_boxes, 3 - const torch::Tensor obb_pose, // n_boxes, 4, 4 - const torch::Tensor obb_enable, // n_boxes, 4, - const torch::Tensor n_env_obb, // n_boxes, 4, 4 - const torch::Tensor env_query_idx, // n_boxes, 4, 4 - const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const int sweep_steps, const bool enable_speed_metric, - const bool transform_back, const bool compute_distance, - const bool use_batch_env) { + torch::Tensor distance, // batch_size, 1 + torch::Tensor + closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, const torch::Tensor speed_dt, + const torch::Tensor obb_accel, // n_boxes, 4, 4 + const torch::Tensor obb_bounds, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, + const torch::Tensor n_env_obb, // n_boxes, 4, 4 + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, const int batch_size, const int horizon, + const int n_spheres, const int sweep_steps, const bool enable_speed_metric, + const bool transform_back, const bool compute_distance, + const bool use_batch_env) +{ using namespace Curobo::Geometry; - - //const int max_batches_per_block = 128; + + // const int max_batches_per_block = 128; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - //const int bh = batch_size * horizon; - //const int warp_n_spheres = n_spheres + (n_spheres % 32);// make n_spheres a multiple of 32 - //int batches_per_block = (bh * warp_n_spheres) / max_batches_per_block; + // const int bh = batch_size * horizon; + + // const int warp_n_spheres = n_spheres + (n_spheres % 32);// make n_spheres a multiple of 32 + // int batches_per_block = (bh * warp_n_spheres) / max_batches_per_block; const int bnh_spheres = n_spheres * batch_size * horizon; // - int threadsPerBlock = bnh_spheres; + int threadsPerBlock = bnh_spheres; // This block is for old kernels? 
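+  // One thread per (batch, horizon, sphere) element. For example (values
+  // illustrative only), batch_size = 2, horizon = 30, n_spheres = 64 gives
+  // bnh_spheres = 3840; after the clamp below, threadsPerBlock = 128 and
+  // blocksPerGrid = 3840 / 128 = 30.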
- if (threadsPerBlock > 128) { + if (threadsPerBlock > 128) + { threadsPerBlock = 128; } int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; - - if (use_batch_env) { - if (compute_distance) { + if (use_batch_env) + { + if (compute_distance) + { if (enable_speed_metric) { // This is the best kernel for now - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_batch_env_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps, - transform_back); - })); - - + swept_sphere_obb_distance_jump_batch_env_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); } else { // This is the best kernel for now - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_batch_env_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps, - transform_back); - })); - - + swept_sphere_obb_distance_jump_batch_env_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); } - - } else { + } + else + { // TODO: implement this later // TODO: call kernel based on flag: AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - swept_sphere_obb_collision_batch_env_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps); - })); + distance.scalar_type(), "SphereObb_collision", ([&] { + swept_sphere_obb_collision_batch_env_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + 
env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps); + })); } - - } else { - if (compute_distance) { - + } + else + { + if (compute_distance) + { if (enable_speed_metric) { - - - // This is the best kernel for now - AT_DISPATCH_FLOATING_TYPES( + // This is the best kernel for now + AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps, - transform_back); - })); + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); } else { // This is the best kernel for now - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps, - transform_back); - })); - + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); } - } else { + } + else + { AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - swept_sphere_obb_collision_kernel - <<>>( - sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, sweep_steps); - })); + distance.scalar_type(), "SphereObb_collision", ([&] { + swept_sphere_obb_collision_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps); + })); } } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {distance, closest_point, sparsity_idx}; //, debug_data}; + return { distance, closest_point, sparsity_idx }; // , debug_data}; } diff --git a/src/curobo/curobolib/cpp/tensor_step_cuda.cpp 
b/src/curobo/curobolib/cpp/tensor_step_cuda.cpp index 1cebd96..612fcfe 100644 --- a/src/curobo/curobolib/cpp/tensor_step_cuda.cpp +++ b/src/curobo/curobolib/cpp/tensor_step_cuda.cpp @@ -13,72 +13,120 @@ #include #include -std::vector step_position_clique( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof); -std::vector step_position_clique2( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof, const int mode); -std::vector step_position_clique2_idx( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, const int mode); +std::vectorstep_position_clique( + torch::Tensor out_position, + torch::Tensor out_velocity, + torch::Tensor out_acceleration, + torch::Tensor out_jerk, + const torch::Tensor u_position, + const torch::Tensor start_position, + const torch::Tensor start_velocity, + const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof); +std::vectorstep_position_clique2( + torch::Tensor out_position, + torch::Tensor out_velocity, + torch::Tensor out_acceleration, + torch::Tensor out_jerk, + const torch::Tensor u_position, + const torch::Tensor start_position, + const torch::Tensor start_velocity, + const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int mode); +std::vectorstep_position_clique2_idx( + torch::Tensor out_position, + torch::Tensor out_velocity, + torch::Tensor out_acceleration, + torch::Tensor out_jerk, + const torch::Tensor u_position, + const torch::Tensor start_position, + const torch::Tensor start_velocity, + const torch::Tensor start_acceleration, + const torch::Tensor start_idx, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int mode); -std::vector backward_step_position_clique( - torch::Tensor out_grad_position, const torch::Tensor grad_position, - const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, - const torch::Tensor grad_jerk, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof); -std::vector backward_step_position_clique2( - torch::Tensor out_grad_position, const torch::Tensor grad_position, - const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, - const torch::Tensor grad_jerk, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, const int mode); +std::vectorbackward_step_position_clique( + torch::Tensor out_grad_position, + const torch::Tensor grad_position, + const torch::Tensor grad_velocity, + const torch::Tensor 
grad_acceleration, + const torch::Tensor grad_jerk, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof); +std::vectorbackward_step_position_clique2( + torch::Tensor out_grad_position, + const torch::Tensor grad_position, + const torch::Tensor grad_velocity, + const torch::Tensor grad_acceleration, + const torch::Tensor grad_jerk, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int mode); std::vector -step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_acc, const torch::Tensor start_position, +step_acceleration(torch::Tensor out_position, + torch::Tensor out_velocity, + torch::Tensor out_acceleration, + torch::Tensor out_jerk, + const torch::Tensor u_acc, + const torch::Tensor start_position, const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, - const int horizon, const int dof, const bool use_rk2 = true); + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof, + const bool use_rk2 = true); + +std::vectorstep_acceleration_idx( + torch::Tensor out_position, + torch::Tensor out_velocity, + torch::Tensor out_acceleration, + torch::Tensor out_jerk, + const torch::Tensor u_acc, + const torch::Tensor start_position, + const torch::Tensor start_velocity, + const torch::Tensor start_acceleration, + const torch::Tensor start_idx, + const torch::Tensor traj_dt, + const int batch_size, + const int horizon, + const int dof, + const bool use_rk2 = true); -std::vector step_acceleration_idx( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_acc, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const bool use_rk2 = true); // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
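+// The CHECK_* macros below gate every wrapper: CHECK_INPUT(x) asserts that x
+// is a CUDA tensor and is contiguous, since the kernels index raw data
+// pointers and assume a dense layout.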
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), # x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -std::vector step_position_clique_wrapper( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof) { +std::vectorstep_position_clique_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof) +{ const at::cuda::OptionalCUDAGuard guard(u_position.device()); + assert(false); // not supported CHECK_INPUT(u_position); CHECK_INPUT(out_position); @@ -96,14 +144,15 @@ std::vector step_position_clique_wrapper( batch_size, horizon, dof); } -std::vector step_position_clique2_wrapper( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof, - const int mode) { +std::vectorstep_position_clique2_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof, + const int mode) +{ const at::cuda::OptionalCUDAGuard guard(u_position.device()); CHECK_INPUT(u_position); @@ -122,14 +171,15 @@ std::vector step_position_clique2_wrapper( batch_size, horizon, dof, mode); } -std::vector step_position_clique2_idx_wrapper( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const int mode) { +std::vectorstep_position_clique2_idx_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const int mode) +{ const at::cuda::OptionalCUDAGuard guard(u_position.device()); CHECK_INPUT(u_position); @@ -144,17 +194,19 @@ std::vector 
   CHECK_INPUT(start_idx);

   return step_position_clique2_idx(
-      out_position, out_velocity, out_acceleration, out_jerk, u_position,
-      start_position, start_velocity, start_acceleration, start_idx, traj_dt,
-      batch_size, horizon, dof, mode);
+           out_position, out_velocity, out_acceleration, out_jerk, u_position,
+           start_position, start_velocity, start_acceleration, start_idx, traj_dt,
+           batch_size, horizon, dof, mode);
 }

-std::vector<torch::Tensor> backward_step_position_clique_wrapper(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof) {
+std::vector<torch::Tensor>backward_step_position_clique_wrapper(
+  torch::Tensor out_grad_position, const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof)
+{
   const at::cuda::OptionalCUDAGuard guard(grad_position.device());
+  assert(false); // not supported

   CHECK_INPUT(out_grad_position);
   CHECK_INPUT(grad_position);
@@ -164,15 +216,17 @@ std::vector<torch::Tensor> backward_step_position_clique_wrapper(
   CHECK_INPUT(traj_dt);

   return backward_step_position_clique(
-      out_grad_position, grad_position, grad_velocity, grad_acceleration,
-      grad_jerk, traj_dt, batch_size, horizon, dof);
+           out_grad_position, grad_position, grad_velocity, grad_acceleration,
+           grad_jerk, traj_dt, batch_size, horizon, dof);
 }

-std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,
-    const int mode) {
+
+std::vector<torch::Tensor>backward_step_position_clique2_wrapper(
+  torch::Tensor out_grad_position, const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof,
+  const int mode)
+{
   const at::cuda::OptionalCUDAGuard guard(grad_position.device());

   CHECK_INPUT(out_grad_position);
@@ -183,17 +237,18 @@ std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
   CHECK_INPUT(traj_dt);

   return backward_step_position_clique2(
-      out_grad_position, grad_position, grad_velocity, grad_acceleration,
-      grad_jerk, traj_dt, batch_size, horizon, dof, mode);
+           out_grad_position, grad_position, grad_velocity, grad_acceleration,
+           grad_jerk, traj_dt, batch_size, horizon, dof, mode);
 }

-std::vector<torch::Tensor> step_acceleration_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_acc, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof, const bool use_rk2 = true) {
+std::vector<torch::Tensor>step_acceleration_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_acc, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt, const int
batch_size, const int horizon, + const int dof, const bool use_rk2 = true) +{ const at::cuda::OptionalCUDAGuard guard(u_acc.device()); CHECK_INPUT(u_acc); @@ -212,14 +267,15 @@ std::vector step_acceleration_wrapper( dof, use_rk2); } -std::vector step_acceleration_idx_wrapper( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_acc, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const bool use_rk2 = true) { +std::vectorstep_acceleration_idx_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_acc, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const bool use_rk2 = true) +{ const at::cuda::OptionalCUDAGuard guard(u_acc.device()); CHECK_INPUT(u_acc); @@ -239,19 +295,20 @@ std::vector step_acceleration_idx_wrapper( batch_size, horizon, dof, use_rk2); } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("step_position", &step_position_clique_wrapper, +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("step_position", &step_position_clique_wrapper, "Tensor Step Position (curobolib)"); - m.def("step_position2", &step_position_clique2_wrapper, + m.def("step_position2", &step_position_clique2_wrapper, "Tensor Step Position (curobolib)"); - m.def("step_idx_position2", &step_position_clique2_idx_wrapper, + m.def("step_idx_position2", &step_position_clique2_idx_wrapper, "Tensor Step Position (curobolib)"); - m.def("step_position_backward", &backward_step_position_clique_wrapper, + m.def("step_position_backward", &backward_step_position_clique_wrapper, "Tensor Step Position (curobolib)"); m.def("step_position_backward2", &backward_step_position_clique2_wrapper, "Tensor Step Position (curobolib)"); - m.def("step_acceleration", &step_acceleration_wrapper, + m.def("step_acceleration", &step_acceleration_wrapper, "Tensor Step Acceleration (curobolib)"); - m.def("step_acceleration_idx", &step_acceleration_idx_wrapper, + m.def("step_acceleration_idx", &step_acceleration_idx_wrapper, "Tensor Step Acceleration (curobolib)"); -} \ No newline at end of file +} diff --git a/src/curobo/curobolib/cpp/tensor_step_kernel.cu b/src/curobo/curobolib/cpp/tensor_step_kernel.cu index a0bd213..933053b 100644 --- a/src/curobo/curobolib/cpp/tensor_step_kernel.cu +++ b/src/curobo/curobolib/cpp/tensor_step_kernel.cu @@ -24,1479 +24,1576 @@ namespace Curobo { -namespace TensorStep + namespace TensorStep + { + template + __global__ void position_clique_loop_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + const float dt = + traj_dt[0]; // 
assume same dt across traj TODO: Implement variable dt + + // read start state: + float u_arr[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_position[b_addrs + i * dof + d_idx]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // read actions: batch, horizon + out_pos[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 0 + out_acc[h_idx] = + (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } + } + + template + __device__ __forceinline__ void compute_backward_difference(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, + scalar_t *out_jerk_mem, + const scalar_t *u_position, + const scalar_t *start_position, + const scalar_t *start_velocity, + const scalar_t *start_acceleration, + const scalar_t *traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) + { + const float dt = + traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + + // read start state: + float in_pos[4]; + float st_pos = 0.0, st_vel = 0.0, st_acc = 0.0; + + #pragma unroll 4 + + for (int i = 0; i < 4; i++) + { + in_pos[i] = 0.0; + } + + float out_pos = 0.0, out_vel = 0.0, out_acc = 0.0, out_jerk = 0.0; + const int b_addrs = b_idx * horizon * dof; + + if (h_idx < 4) + { + st_pos = start_position[b_offset * dof + d_idx]; + st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + + if (h_idx == 0) + { + out_pos = st_pos; + out_vel = st_vel; + out_acc = st_acc; // + } + else + { + if (h_idx == 1) + { + for (int i = 3; i < 4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i) * dof + d_idx]; + } + + + in_pos[0] = st_pos - 2.0 * (st_vel - 0.5 * st_acc * dt) * dt; + + in_pos[1] = st_pos - (st_vel - 0.5 * st_acc * dt) * dt; + + in_pos[2] = st_pos; + } + else if (h_idx == 2) + { + for (int i = 2; i < 4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i) * dof + d_idx]; + } + + in_pos[0] = st_pos - (st_vel - 0.5 * st_acc * dt) * dt; + + in_pos[1] = st_pos; + } + else if (h_idx == 3) + { + for (int i = 1; i < 4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i) * dof + d_idx]; + } + in_pos[0] = st_pos; + } + else // h_idx >= 4 + + { + for (int i = 0; i < 4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i) * dof + d_idx]; + } + } + + out_pos = in_pos[3]; + out_vel = (-in_pos[2] + in_pos[3]) * dt; + out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; + out_jerk = (-in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + 
in_pos[3]) * dt * dt * dt; + } + + + // write out: + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; + } + + template + __device__ __forceinline__ void compute_central_difference_v0(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, + scalar_t *out_jerk_mem, + const scalar_t *u_position, + const scalar_t *start_position, + const scalar_t *start_velocity, + const scalar_t *start_acceleration, + const scalar_t *traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) + { + const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + // dt here is actually 1/dt; + + // read start state: + float out_pos = 0.0, out_vel = 0.0, out_acc = 0.0, out_jerk = 0.0; + float st_pos = 0.0, st_vel = 0.0, st_acc = 0.0; + + const int b_addrs = b_idx * horizon * dof; + float in_pos[5]; // create a 5 value scalar + + #pragma unroll 5 + + for (int i = 0; i < 5; i++) + { + in_pos[i] = 0.0; + } + + if (h_idx < 4) + { + st_pos = start_position[b_offset * dof + d_idx]; + st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + + if (h_idx == 0) + { + out_pos = st_pos; + out_vel = st_vel; + out_acc = st_acc; + } + else if (h_idx < horizon - 2) + { + if (h_idx == 1) + { + in_pos[0] = st_pos - dt * (st_vel - (0.5 * st_acc * dt)); // start -1, start, u0, u1 + in_pos[1] = st_pos; + in_pos[2] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + } + else if (h_idx == 2) + { + in_pos[0] = start_position[b_offset * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; \ + + } + + + else if (h_idx > 2) + { + in_pos[0] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + } + out_pos = in_pos[2]; + out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; + out_acc = (in_pos[3] + in_pos[1] - 2 * in_pos[2]) * dt * dt; + out_jerk = ((-0.5) * in_pos[0] + in_pos[1] - in_pos[3] + (0.5) * in_pos[4]) * + (dt * dt * dt); + } + else if (h_idx == horizon - 2) + { + // use backward difference for jerk + + in_pos[0] = u_position[b_addrs + (h_idx - 1 - 3) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + + out_pos = in_pos[3]; + out_vel = (0.5 * in_pos[4] - 0.5 * in_pos[2]) * dt; + out_acc = (in_pos[4] + in_pos[2] - 2 * in_pos[3]) * dt * dt; + out_jerk = (-1 * in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3]) * dt * dt * dt; 
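+
+          // Why a one-sided stencil here: the central jerk stencil needs two
+          // samples beyond h_idx, which do not exist this close to the end of
+          // the horizon. The backward third difference
+          //   x'''[t] ~= (-x[t-3] + 3*x[t-2] - 3*x[t-1] + x[t]) / dt^3
+          // uses only past samples; since traj_dt stores 1/dt (see the note
+          // where dt is read above), the division appears as multiplication.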
+ } + else if (h_idx == horizon - 1) + { // use backward difference for vel, acc, jerk + for (int i = 0; i < 4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i) * dof + d_idx]; + } + out_pos = in_pos[3]; + out_vel = (-in_pos[2] + in_pos[3]) * dt; + out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; + out_jerk = (-in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3]) * dt * dt * dt; + } + + // write out: + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; + } + + template + __device__ __forceinline__ void compute_central_difference(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, + scalar_t *out_jerk_mem, + const scalar_t *u_position, + const scalar_t *start_position, + const scalar_t *start_velocity, + const scalar_t *start_acceleration, + const scalar_t *traj_dt, + const int batch_size, + const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) + { + const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + // dt here is actually 1/dt; + const float dt_inv = 1.0 / dt; + const float st_jerk = 0.0; // Note: start jerk can also be passed from global memory + // read start state: + float out_pos = 0.0, out_vel = 0.0, out_acc = 0.0, out_jerk = 0.0; + float st_pos = 0.0, st_vel = 0.0, st_acc = 0.0; + + const int b_addrs = b_idx * horizon * dof; + const int b_addrs_action = b_idx * (horizon - 4) * dof; + float in_pos[5]; // create a 5 value scalar + const float acc_scale = 1.0; + + #pragma unroll 5 + + for (int i = 0; i < 5; i++) + { + in_pos[i] = 0.0; + } + + if (h_idx < 5) + { + st_pos = start_position[b_offset * dof + d_idx]; + st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + + if ((h_idx > 3) && (h_idx < horizon - 4)) + { + in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; + in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; + } + + + else if (h_idx == 0) + { + in_pos[0] = (3.0f / 2) * + (-1 * st_acc * (dt_inv * dt_inv) - (dt_inv * dt_inv * dt_inv) * st_jerk) - + 3.0f * dt_inv * + st_vel + st_pos; + in_pos[1] = -2.0f * st_acc * dt_inv * dt_inv - (4.0f / 3) * dt_inv * dt_inv * dt_inv * + st_jerk - 2.0 * dt_inv * st_vel + st_pos; + in_pos[2] = -(3.0f / 2) * st_acc * dt_inv * dt_inv - (7.0f / 6) * dt_inv * dt_inv * dt_inv * + st_jerk - dt_inv * st_vel + st_pos; + in_pos[3] = st_pos; + in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; + } + + else if (h_idx == 1) + { + in_pos[0] = -2.0f * st_acc * dt_inv * dt_inv - (4.0f / 3) * dt_inv * dt_inv * dt_inv * + st_jerk - 2.0 * dt_inv * st_vel + st_pos; + in_pos[1] = -(3.0f / 2) * st_acc * dt_inv * dt_inv - (7.0f / 6) * dt_inv * dt_inv * dt_inv * + st_jerk - dt_inv * st_vel + st_pos; + + + in_pos[2] = st_pos; + in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; + } + + else if (h_idx == 2) + { + in_pos[0] = -(3.0f / 2) * st_acc * dt_inv * dt_inv - (7.0f / 6) * 
dt_inv * dt_inv * dt_inv * + st_jerk - dt_inv * st_vel + st_pos; + in_pos[1] = st_pos; + in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; + in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; + } + else if (h_idx == 3) + { + in_pos[0] = st_pos; + in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; + in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; + } + + else if (h_idx == horizon - 4) + { + in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; + in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; + in_pos[4] = in_pos[3]; // in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + + // d_idx]; + } + + else if (h_idx == horizon - 3) + { + in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; + in_pos[3] = in_pos[2]; // u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[2]; // in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + + // d_idx]; + } + else if (h_idx == horizon - 2) + { + in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; + in_pos[2] = in_pos[1]; + in_pos[3] = in_pos[1]; // u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[1]; // in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + + // d_idx]; + } + + else if (h_idx == horizon - 1) + { + in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; + in_pos[1] = in_pos[0]; + in_pos[2] = in_pos[0]; // u_position[b_addrs_action + (h_idx - 1 ) * dof + d_idx]; + in_pos[3] = in_pos[0]; // u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[0]; // in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + + // d_idx]; + } + out_pos = in_pos[2]; + + // out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; + out_vel = + ((0.083333333f) * in_pos[0] - (0.666666667f) * in_pos[1] + (0.666666667f) * in_pos[3] + + (-0.083333333f) * in_pos[4]) * dt; + + // out_acc = (in_pos[3] + in_pos[1] - 2.0 * in_pos[2]) * dt * dt; + out_acc = + ((-0.083333333f) * in_pos[0] + (1.333333333f) * in_pos[1] + (-2.5f) * in_pos[2] + + (1.333333333f) * in_pos[3] + (-0.083333333f) * in_pos[4]) * dt * dt; + out_jerk = ((-0.5f) * in_pos[0] + in_pos[1] - in_pos[3] + (0.5f) * in_pos[4]) * + (dt * dt * dt); + + // write out: + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; + } + + template + __global__ void position_clique_loop_kernel2( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int 
batch_size, const int horizon, + const int dof) + { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + + // number of threads = batch_size * dof * horizon; + const int h_idx = tid % horizon; + const int d_idx = (tid / horizon) % dof; + const int b_idx = tid / (dof * horizon); + + if (tid >= batch_size * dof * horizon) + { + return; + } + + + const int b_offset = b_idx; + + if (mode == BWD_DIFF) + { + compute_backward_difference(out_position_mem, + out_velocity_mem, + out_acceleration_mem, + out_jerk_mem, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + b_idx, + h_idx, + d_idx, + b_offset); + } + else if (mode == CENTRAL_DIFF) + { + compute_central_difference(out_position_mem, + out_velocity_mem, + out_acceleration_mem, + out_jerk_mem, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + b_idx, + h_idx, + d_idx, + b_offset); + } + else + { + assert(false); + } + } + + template + __global__ void position_clique_loop_idx_kernel2( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + + // number of threads = batch_size * dof * horizon; + const int h_idx = tid % horizon; + const int d_idx = (tid / horizon) % dof; + const int b_idx = tid / (dof * horizon); + + if (tid >= batch_size * dof * horizon) + { + return; + } + + + const int b_offset = start_idx[b_idx]; + + if (mode == BWD_DIFF) + { + compute_backward_difference(out_position_mem, + out_velocity_mem, + out_acceleration_mem, + out_jerk_mem, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + b_idx, + h_idx, + d_idx, + b_offset); + } + else if (mode == CENTRAL_DIFF) + { + compute_central_difference(out_position_mem, + out_velocity_mem, + out_acceleration_mem, + out_jerk_mem, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + b_idx, + h_idx, + d_idx, + b_offset); + } + else + { + assert(false); + } + } + + template + __global__ void backward_position_clique_loop_kernel( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + const int b_addrs = b_idx * horizon * dof; + + // read gradients: + float g_pos[MAX_H]; + float g_vel[MAX_H]; + float g_acc[MAX_H]; + float g_jerk[MAX_H]; + const float dt = traj_dt[0]; + const float dt_2 = dt * dt; // dt * dt; + const float dt_3 = dt * dt * dt; // dt * dt * dt; + + // not used index == 0 + g_pos[0] = 0.0; + g_vel[0] = 0.0; + g_acc[0] = 0.0; + g_jerk[0] = 0.0; +#pragma unroll + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + g_pos[h_idx] = grad_position[b_addrs + (h_idx) * dof + d_idx]; + g_vel[h_idx] = grad_velocity[b_addrs + (h_idx) * dof + d_idx]; + g_acc[h_idx] = 
grad_acceleration[b_addrs + (h_idx) * dof + d_idx]; + g_jerk[h_idx] = grad_jerk[b_addrs + (h_idx) * dof + d_idx]; + } +#pragma unroll + + for (int i = 0; i < 4; i++) + { + g_vel[horizon + i] = 0.0; + g_acc[horizon + i] = 0.0; + g_jerk[horizon + i] = 0.0; + } + + // compute gradient and sum + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) + { + g_pos[h_idx + 1] += + ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + + (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + + (g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + 3 * g_jerk[h_idx + 3] - + g_jerk[h_idx + 4]) * + dt_3); + } + + // write out: + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) + { + out_grad_position[b_addrs + (h_idx) * dof + d_idx] = g_pos[h_idx + 1]; + } + out_grad_position[b_addrs + (horizon - 1) * dof + d_idx] = 0.0; + } + + template + __global__ void backward_position_clique_loop_backward_difference_kernel2( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + + // number of threads = batch_size * dof * horizon; + const int h_idx = tid % horizon; + const int d_idx = (tid / horizon) % dof; + const int b_idx = tid / (dof * horizon); + + if (tid >= batch_size * dof * horizon) + { + return; + } + const int b_addrs = b_idx * horizon * dof; + + if (h_idx == 0) + { + return; + } + + + const float dt = traj_dt[0]; + + // read gradients: + float g_pos = 0.0; + float out_grad = 0.0; + float g_vel[4]; + float g_acc[4]; + float g_jerk[4]; + + #pragma unroll 4 + + for (int i = 0; i < 4; i++) + { + g_vel[i] = 0.0; + g_acc[i] = 0.0; + g_jerk[i] = 0.0; + } + + int hid = h_idx; // + 1; + + g_pos = grad_position[b_addrs + (hid) * dof + d_idx]; + + + if (hid < horizon - 3) + { + for (int i = 0; i < 4; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i) * dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i) * dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i) * dof + d_idx]; + } + } + else if (hid == horizon - 3) + { + for (int i = 0; i < 3; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i) * dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i) * dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i) * dof + d_idx]; + } + } + + + else if (hid == horizon - 2) + { + for (int i = 0; i < 2; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i) * dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i) * dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i) * dof + d_idx]; + } + } + else if (hid == horizon - 1) + { + for (int i = 0; i < 1; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i) * dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i) * dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i) * dof + d_idx]; + } + } + + + out_grad = (g_pos + + (g_vel[0] - g_vel[1]) * dt + + (g_acc[0] - 2 * g_acc[1] + g_acc[2]) * dt * dt + + (g_jerk[0] - 3 * g_jerk[1] + 3 * g_jerk[2] - g_jerk[3]) * dt * dt * dt); + + + // write out: + out_grad_position[b_addrs + (h_idx - 1) * dof + d_idx] = out_grad; + } + + template + __global__ void backward_position_clique_loop_central_difference_kernel2( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, 
const int dof)
+  {
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // number of threads = batch_size * dof * horizon;
+    const int h_idx = tid % horizon;
+    const int d_idx = (tid / horizon) % dof;
+    const int b_idx = tid / (dof * horizon);
+
+    if (tid >= batch_size * dof * horizon)
+    {
+      return;
+    }
+    const int b_addrs = b_idx * horizon * dof;
+    const int b_addrs_action = b_idx * (horizon - 4) * dof;
+
+    if ((h_idx < 2) || (h_idx >= horizon - 2))
+    {
+      return;
+    }
+
+    const float dt = traj_dt[0];
+
+    // read gradients:
+    // float g_pos= 0.0;
+    float out_grad = 0.0;
+    float g_pos[3];
+    float g_vel[5];
+    float g_acc[5];
+    float g_jerk[5];
+
+    #pragma unroll 5
+
+    for (int i = 0; i < 5; i++)
+    {
+      g_vel[i] = 0.0;
+      g_acc[i] = 0.0;
+      g_jerk[i] = 0.0;
+    }
+
+    const int hid = h_idx;
+
+    g_pos[0] = grad_position[b_addrs + (hid) * dof + d_idx];
+    g_pos[1] = 0.0;
+    g_pos[2] = 0.0;
+
+    if ((hid > 1) && (h_idx < horizon - 3))
+    {
+      #pragma unroll
+
+      for (int i = 0; i < 5; i++)
+      {
+        g_vel[i] = grad_velocity[b_addrs + ((hid - 2) + i) * dof + d_idx];
+        g_acc[i] = grad_acceleration[b_addrs + ((hid - 2) + i) * dof + d_idx];
+        g_jerk[i] = grad_jerk[b_addrs + ((hid - 2) + i) * dof + d_idx];
+      }
+      out_grad = (g_pos[0] +
+
+// ((-0.5) * g_vel[3] + (0.5) * g_vel[1]) * dt +
+
+                  ((-0.083333333f) * g_vel[0] + (0.666666667f) * g_vel[1] + (-0.666666667f) *
+                   g_vel[3] + (0.083333333f) * g_vel[4]) * dt +
+
+                  ((-0.083333333f) * g_acc[0] + (1.333333333f) * g_acc[1] + (-2.5f) * g_acc[2] +
+                   (1.333333333f) * g_acc[3] + (-0.083333333f) * g_acc[4]) * dt * dt +
+
+// ( g_acc[3] + g_acc[1] - (2.0) * g_acc[2]) * dt * dt +
+
+                  (0.5f * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5f * g_jerk[4]) * dt * dt * dt);
+    }
+    else if (hid == horizon - 3)
+    {
+      // The below can cause oscillatory gradient steps.
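+      // Context for the disabled block below: near the trailing boundary the
+      // one-sided stencils couple the last few knot points, and
+      // backpropagating through them exactly was observed to produce
+      // oscillating gradient steps. The fallback further down simply sums the
+      // raw position gradients of the last three steps
+      // (out_grad = g_pos[0] + g_pos[1] + g_pos[2]), trading gradient
+      // exactness for stable optimization steps. The fractional constants in
+      // the branch above are the transposed 4th-order central-difference
+      // coefficients: 1/12 = 0.0833... and 2/3 = 0.6666... for the first
+      // derivative, and 16/12 = 1.3333... and 30/12 = 2.5 for the second.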
+ + /* + #pragma unroll + for (int i=0; i< 5; i++) + { + g_vel[i] = grad_velocity[b_addrs + ((hid - 2) + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + ((hid -2) + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + ((hid -2) + i)*dof + d_idx]; + } + */ + g_pos[1] = grad_position[b_addrs + (hid + 1) * dof + d_idx]; + g_pos[2] = grad_position[b_addrs + (hid + 2) * dof + d_idx]; + + out_grad = (g_pos[0] + g_pos[1] + g_pos[2]); + + /* + + //((0.5) * g_vel[1] + (0.5) * g_vel[2]) * dt + + ((-0.083333333f) * g_vel[0] + (0.583333333f) * g_vel[1] + (0.583333333f) * g_vel[2] + + (-0.083333333f) * g_vel[3]) * dt + + ((-0.083333333f) * g_acc[0] + (1.25f) * g_acc[1] + (-1.25f) * g_acc[2] + (0.083333333f) * + g_acc[3]) * dt * dt + + //( g_acc[1] - g_acc[2]) * dt * dt + + (0.5f * g_jerk[0] - 0.5f * g_jerk[1] -0.5f * g_jerk[2] + 0.5f * g_jerk[3]) * dt * dt * + dt); + */ + } + + + // write out: + out_grad_position[b_addrs_action + (h_idx - 2) * dof + d_idx] = out_grad; + } + + // for MPPI: + + template + __global__ void + acceleration_loop_kernel(scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, + const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // do semi implicit euler integration: + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } + } + + template + __global__ void acceleration_loop_rk2_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if 
((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // do rk2 integration: + + out_acc[h_idx] = u_arr[h_idx - 1]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx - 1] * dt[h_idx] + + 0.5 * dt[h_idx] * dt[h_idx] * out_acc[h_idx]; + out_vel[h_idx] = out_vel[h_idx - 1] + 0.5 * dt[h_idx] * out_acc[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } + } + + template + __global__ void acceleration_loop_idx_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + const int b_offset = start_idx[b_idx]; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_offset * dof + d_idx]; + out_vel[0] = start_velocity[b_offset * dof + d_idx]; + out_acc[0] = start_acceleration[b_offset * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // do semi implicit euler integration: + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } + } + + template + __global__ void 
acceleration_loop_idx_rk2_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + const int b_offset = start_idx[b_idx]; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_offset * dof + d_idx]; + out_vel[0] = start_velocity[b_offset * dof + d_idx]; + out_acc[0] = start_acceleration[b_offset * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // do semi implicit euler integration: + + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } + } + + // Not used + + template + __global__ void position_clique_kernel( + scalar_t *out_position, scalar_t *out_velocity, scalar_t *out_acceleration, + scalar_t *out_jerk, const scalar_t *u_position, + const scalar_t *start_position, const scalar_t *start_velocity, + const scalar_t *start_acceleration, const scalar_t *traj_dt, + const int batch_size, const int horizon, const int dof) + { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (horizon * dof); + const int h_idx = (tid - b_idx * (horizon * dof)) / dof; + const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); + + if ((b_idx >= batch_size) || (h_idx >= horizon) || (d_idx >= dof)) + { + return; + } + + float new_position, new_velocity, new_acceleration, new_jerk; + const float dt = traj_dt[h_idx]; + + // read actions: batch, horizon + if (h_idx == 0) + { + new_position = start_position[b_idx * dof + d_idx]; + new_velocity = start_velocity[b_idx * dof + d_idx]; + new_acceleration = start_acceleration[b_idx * dof + d_idx]; + new_jerk = 0.0; + } + else if (h_idx == 1) + { + float2 u_clique = make_float2( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + new_position = u_clique.y; + new_velocity = (u_clique.y - u_clique.x) * dt; // 1 - 0 + new_acceleration = + (u_clique.y - u_clique.x) * dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = (u_clique.y - u_clique.x) * dt * dt * dt; // -1 3 -3 1 + } + else if (h_idx == 2) + 
{ + float3 u_clique = make_float3( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + + new_position = u_clique.z; + new_velocity = (u_clique.z - u_clique.y) * dt; // 1 - 0 + new_acceleration = (u_clique.x - 2 * u_clique.y + u_clique.z) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = (2 * u_clique.x - 3 * u_clique.y + u_clique.z) * + dt * dt * dt; // -1 3 -3 1 + } + else if (h_idx == 3) + { + float4 u_clique = make_float4( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + new_position = u_clique.w; + new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 + new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = + (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * + dt * dt * dt; + } + else + { + float4 u_clique = make_float4( + u_position[b_idx * horizon * dof + (h_idx - 4) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + new_position = u_clique.w; + new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 + new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = + (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * + dt * dt * dt; + } + + // h_idx = h_idx + 1; + out_position[b_idx * horizon * dof + h_idx * dof + d_idx] = + new_position; // new_position; + out_velocity[b_idx * horizon * dof + h_idx * dof + d_idx] = new_velocity; + out_acceleration[b_idx * horizon * dof + h_idx * dof + d_idx] = + new_acceleration; + out_jerk[b_idx * horizon * dof + h_idx * dof + d_idx] = new_jerk; + } + + // Not used + template + __global__ void position_clique_loop_coalesce_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) + { + // data is stored as batch, dof, horizon + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + + const float dt = + traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + + // read start state: + float u_arr[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + + for (int i = 0; i < horizon; i++) + { + u_arr[i] = u_position[b_addrs + d_idx * horizon + i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) + { + // read actions: batch, horizon + + out_pos[h_idx] = u_arr[h_idx - 1]; + + out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 
0 + out_acc[h_idx] = + (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + out_position_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_jerk[h_idx]; + } + } + + // Not used + template + __global__ void backward_position_clique_loop_coalesce_kernel( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + + if ((b_idx >= batch_size) || (d_idx >= dof)) + { + return; + } + const int b_addrs = b_idx * horizon * dof; + + // read gradients: + float g_pos[MAX_H]; + float g_vel[MAX_H]; + float g_acc[MAX_H]; + float g_jerk[MAX_H]; + const float dt = traj_dt[0]; + const float dt_2 = dt * dt; + const float dt_3 = dt * dt * dt; +#pragma unroll + + for (int h_idx = 0; h_idx < horizon; h_idx++) + { + g_pos[h_idx] = grad_position[b_addrs + d_idx * horizon + h_idx]; + g_vel[h_idx] = grad_velocity[b_addrs + d_idx * horizon + h_idx]; + g_acc[h_idx] = grad_acceleration[b_addrs + d_idx * horizon + h_idx]; + g_jerk[h_idx] = grad_jerk[b_addrs + d_idx * horizon + h_idx]; + } +#pragma unroll + + for (int i = 0; i < 4; i++) + { + g_vel[horizon + i] = 0.0; + g_acc[horizon + i] = 0.0; + g_jerk[horizon + i] = 0.0; + } + + // compute gradient and sum + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) + { + g_pos[h_idx + 1] += + ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + + (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + + (1 * g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + + 3 * g_jerk[h_idx + 3] - g_jerk[h_idx + 4]) * + dt_3); + } + + // write out: + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) + { + out_grad_position[b_addrs + d_idx * horizon + h_idx] = g_pos[h_idx + 1]; + } + out_grad_position[b_addrs + d_idx * horizon + horizon - 1] = 0.0; + } + + // Not used + template + __global__ void backward_position_clique_kernel( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) + { + // TODO: transpose h and dof to be able to directly read float2, float3, etc.. 
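+    // Layout note for the TODO above: with the [batch, horizon, dof] layout
+    // used in this kernel, consecutive horizon samples of a single joint sit
+    // dof floats apart, so they cannot be fetched with one float2/float4
+    // load. The *_coalesce kernels above index as
+    // b_addrs + d_idx * horizon + h_idx, i.e. a [batch, dof, horizon]
+    // layout in which a joint's trajectory is contiguous and vectorized
+    // loads become possible.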
+ const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (horizon * dof); + const int h_idx = (tid - b_idx * (horizon * dof)) / dof; + const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); + + if ((b_idx >= batch_size) || (h_idx >= horizon) || (d_idx >= dof)) + { + return; + } + const int b_addrs = b_idx * horizon * dof; + + if (h_idx == horizon - 1) + { + out_grad_position[b_addrs + (h_idx) * dof + d_idx] = 0.0; + return; + } + + // read gradients: + const float dt = traj_dt[0]; + float g_u = grad_position[b_addrs + (h_idx + 1) * dof + d_idx]; + + float2 g_vel; + float3 g_acc; + float4 g_jerk; + + if (h_idx < horizon - 4) // && h_idx > 0) + { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]); + + g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 4) * dof + d_idx]); + g_u += + ((g_vel.x - g_vel.y) * dt + + (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + + (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); + } + else if (h_idx == 0) + { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], + 0.0); + + g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0); + g_u += ((g_vel.x - g_vel.y) * dt + + (-1.0 * g_acc.x + 1 * g_acc.y) * dt * dt + + (-1 * g_jerk.x + 2 * g_jerk.y - 1 * g_jerk.z) * dt * dt * dt); + } + else if (h_idx == horizon - 4) + { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]); + g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0); + g_u += + ((g_vel.x - g_vel.y) * dt + + (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + + (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); + } + else if (h_idx == horizon - 3) + { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + g_acc = + make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], 0); + g_jerk = + make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], 0.0, 0.0); + g_u += + ((g_vel.x - g_vel.y) * dt + + (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + + (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); + } + else if (h_idx == horizon - 2) + { + g_vel = + make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], 0.0); + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + 0, 0); + 
g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], 0.0, + 0.0, 0.0); + g_u += + ((g_vel.x - g_vel.y) * dt + + (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + + (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); + } + + out_grad_position[b_addrs + (h_idx) * dof + d_idx] = g_u; + } + } // namespace +} +std::vectorstep_position_clique( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof) { - -template -__global__ void position_clique_loop_kernel( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - - const float dt = - traj_dt[0]; // assume same dt across traj TODO: Implement variable dt - - // read start state: - float u_arr[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_position[b_addrs + i * dof + d_idx]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_idx * dof + d_idx]; - out_vel[0] = start_velocity[b_idx * dof + d_idx]; - out_acc[0] = start_acceleration[b_idx * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - - // read actions: batch, horizon - out_pos[h_idx] = u_arr[h_idx - 1]; - out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 0 - out_acc[h_idx] = - (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 - } - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; - } -} - -template -__device__ __forceinline__ void compute_backward_difference(scalar_t *out_position_mem, - scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof, - const int b_idx, - const int h_idx, - const int d_idx, - const int b_offset) { - - const float dt = - traj_dt[0]; // assume same dt across traj TODO: Implement variable dt - - // read start state: - float in_pos[4]; - float st_pos=0.0, st_vel=0.0, st_acc = 0.0; - - #pragma unroll 4 - for (int i=0; i< 4; i++) - { - in_pos[i] = 0.0; - } - - float out_pos=0.0, out_vel=0.0, 
out_acc=0.0, out_jerk=0.0; - const int b_addrs = b_idx * horizon * dof; - if (h_idx < 4){ - st_pos = start_position[b_offset * dof + d_idx]; - st_vel = start_velocity[b_offset * dof + d_idx]; - st_acc = start_acceleration[b_offset * dof + d_idx]; - } - if (h_idx == 0) { - out_pos = st_pos; - out_vel = st_vel; - out_acc = st_acc; // - } - else { - if (h_idx == 1) { - - for(int i=3; i<4; i++) - { - in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; - } - - - in_pos[0] = st_pos - 2.0 * (st_vel - 0.5 * st_acc*dt) * dt; - - in_pos[1] = st_pos - (st_vel - 0.5 * st_acc*dt) * dt; - - in_pos[2] = st_pos; - - - } else if (h_idx == 2) { - - - for(int i=2; i<4; i++) - { - in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i)*dof + d_idx]; - } - - in_pos[0] = st_pos - (st_vel - 0.5 * st_acc*dt) * dt; - - in_pos[1] = st_pos; - - - - - } - else if (h_idx == 3) - { - - for(int i=1; i<4; i++) - { - in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; - } - in_pos[0] = st_pos; - - } - else { // h_idx >= 4 - - for(int i=0; i<4; i++) - { - in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; - } - } - - out_pos = in_pos[3]; - out_vel = (-in_pos[2] + in_pos[3]) * dt; - out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; - out_jerk = (-in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3]) * dt * dt * dt; - - } - - - // write out: - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; -} - - -template -__device__ __forceinline__ void compute_central_difference_v0(scalar_t *out_position_mem, - scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof, - const int b_idx, - const int h_idx, - const int d_idx, - const int b_offset) { - - - const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt - // dt here is actually 1/dt; - - // read start state: - float out_pos=0.0, out_vel=0.0, out_acc=0.0, out_jerk=0.0; - float st_pos=0.0, st_vel=0.0, st_acc = 0.0; - - const int b_addrs = b_idx * horizon * dof; - float in_pos[5]; // create a 5 value scalar - - #pragma unroll 5 - for (int i=0; i< 5; i ++){ - in_pos[i] = 0.0; - } - if (h_idx < 4){ - st_pos = start_position[b_offset * dof + d_idx]; - st_vel = start_velocity[b_offset * dof + d_idx]; - st_acc = start_acceleration[b_offset * dof + d_idx]; - } - if (h_idx == 0) - { - out_pos = st_pos; - out_vel = st_vel; - out_acc = st_acc; - } - else if (h_idx < horizon - 2) - { - if (h_idx == 1) - { - - in_pos[0] = st_pos - dt * ( st_vel - (0.5 * st_acc * dt)); // start -1, start, u0, u1 - in_pos[1] = st_pos; - in_pos[2] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; - in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; - - } - else if (h_idx == 2) - { - in_pos[0] = start_position[b_offset * dof + d_idx]; - in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; - in_pos[2] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; - in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = 
u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx];\ - - } - - - else if (h_idx > 2) - { - in_pos[0] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; - in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; - in_pos[2] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; - in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; - - } - out_pos = in_pos[2]; - out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; - out_acc = (in_pos[3] + in_pos[1] - 2 * in_pos[2]) * dt * dt; - out_jerk = ((-0.5)*in_pos[0] + in_pos[1] - in_pos[3] + (0.5)*in_pos[4]) * (dt * dt * dt); - - } - else if (h_idx == horizon -2) - { - // use backward difference for jerk - - in_pos[0] = u_position[b_addrs + (h_idx - 1 - 3) * dof + d_idx]; - in_pos[1] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; - in_pos[2] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; - in_pos[3] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; - in_pos[4] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; - - out_pos = in_pos[3]; - out_vel = (0.5 * in_pos[4] - 0.5 * in_pos[2]) * dt; - out_acc = (in_pos[4] + in_pos[2] - 2 * in_pos[3]) * dt * dt; - out_jerk= (-1 * in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3] ) * dt * dt * dt; - } - else if (h_idx == horizon -1) - { // use backward difference for vel, acc, jerk - for(int i=0; i<4; i++) - { - in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; - } - out_pos = in_pos[3]; - out_vel = (-in_pos[2] + in_pos[3]) * dt; - out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; - out_jerk = (-in_pos[0] + 3 * in_pos[1] -3 * in_pos[2] + in_pos[3]) * dt * dt * dt; - } - // write out: - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; -} - - -template -__device__ __forceinline__ void compute_central_difference(scalar_t *out_position_mem, - scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof, - const int b_idx, - const int h_idx, - const int d_idx, - const int b_offset) { - - - const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt - // dt here is actually 1/dt; - const float dt_inv = 1.0 / dt; - const float st_jerk = 0.0; // Note: start jerk can also be passed from global memory - // read start state: - float out_pos=0.0, out_vel=0.0, out_acc=0.0, out_jerk=0.0; - float st_pos=0.0, st_vel=0.0, st_acc = 0.0; - - const int b_addrs = b_idx * horizon * dof; - const int b_addrs_action = b_idx * (horizon-4) * dof; - float in_pos[5]; // create a 5 value scalar - const float acc_scale = 1.0; - #pragma unroll 5 - for (int i=0; i< 5; i ++){ - in_pos[i] = 0.0; - } - if (h_idx < 5){ - st_pos = start_position[b_offset * dof + d_idx]; - st_vel = start_velocity[b_offset * dof + d_idx]; - st_acc = start_acceleration[b_offset * dof + d_idx]; - } - - if (h_idx > 3 && h_idx < horizon - 4){ - in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; - in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; - in_pos[2] = u_position[b_addrs_action + (h_idx 
- 2 ) * dof + d_idx]; - in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; - in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; - - } - - - else if (h_idx == 0) - { - - - - - in_pos[0] = (3.0f/2) * ( - 1 * st_acc * (dt_inv * dt_inv) - (dt_inv * dt_inv * dt_inv) * st_jerk ) - 3.0f * dt_inv * st_vel + st_pos; - in_pos[1] = -2.0f * st_acc * dt_inv * dt_inv - (4.0f/3) * dt_inv * dt_inv * dt_inv * st_jerk - 2.0 * dt_inv * st_vel + st_pos; - in_pos[2] = -(3.0f/2) * st_acc * dt_inv * dt_inv - (7.0f/6) * dt_inv * dt_inv * dt_inv * st_jerk - dt_inv * st_vel + st_pos; - in_pos[3] = st_pos; - in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; - - - - - - - } - - else if (h_idx == 1) - { - - in_pos[0] = -2.0f * st_acc * dt_inv * dt_inv - (4.0f/3) * dt_inv * dt_inv * dt_inv * st_jerk - 2.0 * dt_inv * st_vel + st_pos; - in_pos[1] = -(3.0f/2) * st_acc * dt_inv * dt_inv - (7.0f/6) * dt_inv * dt_inv * dt_inv * st_jerk - dt_inv * st_vel + st_pos; - - - in_pos[2] = st_pos; - in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; - in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; - - - } - - else if (h_idx == 2) - { - in_pos[0] = -(3.0f/2) * st_acc * dt_inv * dt_inv - (7.0f/6) * dt_inv * dt_inv * dt_inv * st_jerk - dt_inv * st_vel + st_pos; - in_pos[1] = st_pos; - in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; - in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; - in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; - - - } - else if (h_idx == 3) - { - in_pos[0] = st_pos; - in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; - in_pos[2] = u_position[b_addrs_action + (h_idx - 2 ) * dof + d_idx]; - in_pos[3] = u_position[b_addrs_action + (h_idx - 1 ) * dof + d_idx]; - in_pos[4] = u_position[b_addrs_action + (h_idx) * dof + d_idx]; - - } - - else if (h_idx == horizon - 4) - { - in_pos[0] = u_position[b_addrs_action + (h_idx -4) * dof + d_idx]; - in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; - in_pos[2] = u_position[b_addrs_action + (h_idx - 2 ) * dof + d_idx]; - in_pos[3] = u_position[b_addrs_action + (h_idx - 1) * dof + d_idx]; - in_pos[4] = in_pos[3];//in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + d_idx]; - - } - - else if (h_idx == horizon - 3) - { - in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; - in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; - in_pos[2] = u_position[b_addrs_action + (h_idx - 2) * dof + d_idx]; - in_pos[3] = in_pos[2];//u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = in_pos[2];//in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + d_idx]; - - } - else if (h_idx == horizon - 2) - { - in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; - in_pos[1] = u_position[b_addrs_action + (h_idx - 3) * dof + d_idx]; - in_pos[2] = in_pos[1]; - in_pos[3] = in_pos[1];//u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = in_pos[1];//in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + d_idx]; - - } - - else if (h_idx == horizon - 1) - { - in_pos[0] = u_position[b_addrs_action + (h_idx - 4) * dof + d_idx]; - in_pos[1] = in_pos[0]; - in_pos[2] = in_pos[0];//u_position[b_addrs_action + (h_idx - 1 ) * dof + d_idx]; - in_pos[3] = in_pos[0];//u_position[b_addrs_action + (h_idx - 1 + 1) * dof + d_idx]; - in_pos[4] = in_pos[0];//in_pos[3]; //u_position[b_addrs_action + (h_idx - 1 + 2) * dof + d_idx]; - } - out_pos 
= in_pos[2]; - - // out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; - out_vel = ((0.083333333f) * in_pos[0] - (0.666666667f) * in_pos[1] + (0.666666667f) * in_pos[3] + (-0.083333333f) * in_pos[4]) * dt; - - // out_acc = (in_pos[3] + in_pos[1] - 2.0 * in_pos[2]) * dt * dt; - out_acc = ((-0.083333333f) * in_pos[0] + (1.333333333f) * in_pos[1] + (-2.5f) * in_pos[2] + (1.333333333f) * in_pos[3] + (-0.083333333f) * in_pos[4]) * dt * dt; - out_jerk = ((-0.5f)*in_pos[0] + in_pos[1] - in_pos[3] + (0.5f)*in_pos[4]) * (dt * dt * dt); - - // write out: - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; -} - - -template -__global__ void position_clique_loop_kernel2( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof) { - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - // number of threads = batch_size * dof * horizon; - const int h_idx = tid % horizon; - const int d_idx = (tid / horizon) % dof; - const int b_idx = tid / (dof * horizon); - if (tid >= batch_size * dof * horizon) { - return; - } - - - const int b_offset = b_idx; - if (mode == BWD_DIFF) - { - compute_backward_difference(out_position_mem, out_velocity_mem, - out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity, - start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset); - } - else if (mode == CENTRAL_DIFF) - { - - compute_central_difference(out_position_mem, out_velocity_mem, - out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity, - start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset); - - } - else - { - assert(false); - } - -} - - -template -__global__ void position_clique_loop_idx_kernel2( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - // number of threads = batch_size * dof * horizon; - const int h_idx = tid % horizon; - const int d_idx = (tid / horizon) % dof; - const int b_idx = tid / (dof * horizon); - if (tid >= batch_size * dof * horizon) { - return; - } - - - const int b_offset = start_idx[b_idx]; - if (mode == BWD_DIFF) - { - compute_backward_difference(out_position_mem, out_velocity_mem, - out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity, - start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset); - } - else if (mode == CENTRAL_DIFF) - { - - compute_central_difference(out_position_mem, out_velocity_mem, - out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity, - start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset); - - } - else - { - assert(false); - } -} - -template -__global__ void 
backward_position_clique_loop_kernel( - scalar_t *out_grad_position, const scalar_t *grad_position, - const scalar_t *grad_velocity, const scalar_t *grad_acceleration, - const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - const int b_addrs = b_idx * horizon * dof; - // read gradients: - float g_pos[MAX_H]; - float g_vel[MAX_H]; - float g_acc[MAX_H]; - float g_jerk[MAX_H]; - const float dt = traj_dt[0]; - const float dt_2 = dt * dt ;//dt * dt; - const float dt_3 = dt * dt * dt;//dt * dt * dt; - - // not used index == 0 - g_pos[0] = 0.0; - g_vel[0] = 0.0; - g_acc[0] = 0.0; - g_jerk[0] = 0.0; -#pragma unroll - for (int h_idx = 1; h_idx < horizon; h_idx++) { - g_pos[h_idx] = grad_position[b_addrs + (h_idx)*dof + d_idx]; - g_vel[h_idx] = grad_velocity[b_addrs + (h_idx)*dof + d_idx]; - g_acc[h_idx] = grad_acceleration[b_addrs + (h_idx)*dof + d_idx]; - g_jerk[h_idx] = grad_jerk[b_addrs + (h_idx)*dof + d_idx]; - } -#pragma unroll - for (int i = 0; i < 4; i++) { - g_vel[horizon + i] = 0.0; - g_acc[horizon + i] = 0.0; - g_jerk[horizon + i] = 0.0; - } - // compute gradient and sum - for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { - g_pos[h_idx + 1] += - ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + - (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + - (g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + 3 * g_jerk[h_idx + 3] - - g_jerk[h_idx + 4]) * - dt_3); - } - - // write out: - for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { - out_grad_position[b_addrs + (h_idx)*dof + d_idx] = g_pos[h_idx + 1]; - } - out_grad_position[b_addrs + (horizon - 1) * dof + d_idx] = 0.0; -} - - - -template -__global__ void backward_position_clique_loop_backward_difference_kernel2( - scalar_t *out_grad_position, const scalar_t *grad_position, - const scalar_t *grad_velocity, const scalar_t *grad_acceleration, - const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - // number of threads = batch_size * dof * horizon; - const int h_idx = tid % horizon; - const int d_idx = (tid / horizon) % dof; - const int b_idx = tid / (dof * horizon); - if (tid >= batch_size * dof * horizon) { - return; - } - const int b_addrs = b_idx * horizon * dof; - if (h_idx == 0) - { - return; - } - - - const float dt = traj_dt[0]; - - // read gradients: - float g_pos= 0.0; - float out_grad = 0.0; - float g_vel[4]; - float g_acc[4]; - float g_jerk[4]; - - #pragma unroll 4 - for (int i=0; i<4; i++) - { - g_vel[i] = 0.0; - g_acc[i] = 0.0; - g_jerk[i] = 0.0; - } - - int hid = h_idx;// + 1; - - g_pos = grad_position[b_addrs + (hid)*dof + d_idx]; - - - - if(hid < horizon - 3) - { - - for (int i=0; i< 4; i++) - { - g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx]; - g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; - g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; - } - - - } - else if (hid == horizon -3) - { - for (int i=0; i< 3; i++) - { - g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx]; - g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; - g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; - } - } - - - else if (hid == horizon -2) - { - for (int i=0; i< 2; i++) - { - g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof 
+ d_idx];
-      g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx];
-      g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx];
-    }
-  }
-  else if (hid == horizon -1)
-  {
-    for (int i=0; i< 1; i++)
-    {
-      g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx];
-      g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx];
-      g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx];
-    }
-
-  }
-
-
-
-  out_grad = (g_pos +
-              (g_vel[0] - g_vel[1]) * dt +
-              (g_acc[0] -2 * g_acc[1] + g_acc[2]) * dt * dt +
-              (g_jerk[0] - 3 * g_jerk[1] + 3 * g_jerk[2] - g_jerk[3]) * dt * dt * dt);
-
-
-
-  // write out:
-  out_grad_position[b_addrs + (h_idx-1)*dof + d_idx] = out_grad;
-
-}
-
-template <typename scalar_t>
-__global__ void backward_position_clique_loop_central_difference_kernel2(
-    scalar_t *out_grad_position, const scalar_t *grad_position,
-    const scalar_t *grad_velocity, const scalar_t *grad_acceleration,
-    const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size,
-    const int horizon, const int dof) {
-  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
-  // number of threads = batch_size * dof * horizon;
-  const int h_idx = tid % horizon;
-  const int d_idx = (tid / horizon) % dof;
-  const int b_idx = tid / (dof * horizon);
-  if (tid >= batch_size * dof * horizon) {
-    return;
-  }
-  const int b_addrs = b_idx * horizon * dof;
-  const int b_addrs_action = b_idx * (horizon-4) * dof;
-
-  if (h_idx < 2 || h_idx >= horizon - 2)
-  {
-    return;
-  }
-
-  const float dt = traj_dt[0];
-
-  // read gradients:
-  //float g_pos= 0.0;
-  float out_grad = 0.0;
-  float g_pos[3];
-  float g_vel[5];
-  float g_acc[5];
-  float g_jerk[5];
-
-  #pragma unroll 5
-  for (int i=0; i<5; i++)
-  {
-    g_vel[i] = 0.0;
-    g_acc[i] = 0.0;
-    g_jerk[i] = 0.0;
-  }
-
-  const int hid = h_idx;
-
-  g_pos[0] = grad_position[b_addrs + (hid)*dof + d_idx];
-  g_pos[1] = 0.0;
-  g_pos[2] = 0.0;
-
-
-  if(hid > 1 && h_idx < horizon - 3)
-  {
-    #pragma unroll
-    for (int i=0; i< 5; i++)
-    {
-      g_vel[i] = grad_velocity[b_addrs + ((hid -2) + i)*dof + d_idx];
-      g_acc[i] = grad_acceleration[b_addrs + ((hid -2) + i)*dof + d_idx];
-      g_jerk[i] = grad_jerk[b_addrs + ((hid -2) + i)*dof + d_idx];
-    }
-    out_grad = (g_pos[0] +
-                //((-0.5) * g_vel[3] + (0.5) * g_vel[1]) * dt +
-                ((-0.083333333f) * g_vel[0] + (0.666666667f) * g_vel[1] + (-0.666666667f) * g_vel[3] + (0.083333333f) * g_vel[4]) * dt +
-                ((-0.083333333f) * g_acc[0] + (1.333333333f) * g_acc[1] + (-2.5f) * g_acc[2] + (1.333333333f) * g_acc[3] + (-0.083333333f) * g_acc[4]) * dt * dt +
-                //( g_acc[3] + g_acc[1] - (2.0) * g_acc[2]) * dt * dt +
-                (0.5f * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5f * g_jerk[4]) * dt * dt * dt);
-
-  }
-  else if (hid == horizon - 3)
-  {
-    // The below can cause oscillatory gradient steps.
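The interior branch above applies the transposes of the 5-point central-difference stencils from the forward pass. A host-side C++ sketch of that accumulation, assuming a uniform timestep and the kernels' convention that `dt` actually stores 1/delta_t (noted as `dt here is actually 1/dt` in the forward kernel); the function name is illustrative, not curobolib API:

```cpp
#include <array>

// Gradient of the loss w.r.t. the position at an interior timestep h, given
// gradients w.r.t. velocity/acceleration/jerk at timesteps h-2 .. h+2.
// The velocity stencil (1/12, -2/3, 0, 2/3, -1/12) is antisymmetric, so its
// transpose flips sign; the acceleration stencil is symmetric and unchanged.
float interior_position_grad(float g_pos,
                             const std::array<float, 5> &g_vel,
                             const std::array<float, 5> &g_acc,
                             const std::array<float, 5> &g_jerk,
                             float dt) { // dt = 1 / delta_t
  const float v = (-0.083333333f) * g_vel[0] + (0.666666667f) * g_vel[1] +
                  (-0.666666667f) * g_vel[3] + (0.083333333f) * g_vel[4];
  const float a = (-0.083333333f) * g_acc[0] + (1.333333333f) * g_acc[1] +
                  (-2.5f) * g_acc[2] + (1.333333333f) * g_acc[3] +
                  (-0.083333333f) * g_acc[4];
  const float j = 0.5f * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5f * g_jerk[4];
  return g_pos + v * dt + a * dt * dt + j * dt * dt * dt;
}
```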
- - /* - #pragma unroll - for (int i=0; i< 5; i++) - { - g_vel[i] = grad_velocity[b_addrs + ((hid - 2) + i)*dof + d_idx]; - g_acc[i] = grad_acceleration[b_addrs + ((hid -2) + i)*dof + d_idx]; - g_jerk[i] = grad_jerk[b_addrs + ((hid -2) + i)*dof + d_idx]; - } - */ - g_pos[1] = grad_position[b_addrs + (hid + 1)*dof + d_idx]; - g_pos[2] = grad_position[b_addrs + (hid + 2)*dof + d_idx]; - - out_grad = (g_pos[0] + g_pos[1] + g_pos[2]); - /* + - //((0.5) * g_vel[1] + (0.5) * g_vel[2]) * dt + - ((-0.083333333f) * g_vel[0] + (0.583333333f) * g_vel[1] + (0.583333333f) * g_vel[2] + (-0.083333333f) * g_vel[3]) * dt + - ((-0.083333333f) * g_acc[0] + (1.25f) * g_acc[1] + (-1.25f) * g_acc[2] + (0.083333333f) * g_acc[3]) * dt * dt + - //( g_acc[1] - g_acc[2]) * dt * dt + - (0.5f * g_jerk[0] - 0.5f * g_jerk[1] -0.5f * g_jerk[2] + 0.5f * g_jerk[3]) * dt * dt * dt); - */ - } - - - // write out: - out_grad_position[b_addrs_action + (h_idx-2)*dof + d_idx] = out_grad; -} - -// for MPPI: - -template -__global__ void -acceleration_loop_kernel(scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_acc, const scalar_t *start_position, - const scalar_t *start_velocity, - const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - - // read start state: - float u_arr[MAX_H], dt[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; - dt[i] = traj_dt[i]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_idx * dof + d_idx]; - out_vel[0] = start_velocity[b_idx * dof + d_idx]; - out_acc[0] = start_acceleration[b_idx * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - // do semi implicit euler integration: - out_acc[h_idx] = u_arr[h_idx - 1]; - out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; - out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; - } - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; - } -} -template -__global__ void acceleration_loop_rk2_kernel( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_acc, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } 
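The semi-implicit Euler update in `acceleration_loop_kernel` above, and the RK2 update used by the kernel whose body follows, each reduce to a few lines per step. A minimal host-side C++ sketch of both rules, assuming a scalar per-step `dt` in seconds as these kernels use (the struct and function names are illustrative, not curobolib API):

```cpp
struct JointState { float pos, vel, acc, jerk; };

// One step of semi-implicit Euler from acceleration control u:
// the new acceleration updates velocity first, then position.
JointState euler_step(const JointState &s, float u, float dt) {
  JointState n;
  n.acc  = u;
  n.vel  = s.vel + n.acc * dt;
  n.pos  = s.pos + n.vel * dt;
  n.jerk = (n.acc - s.acc) / dt;
  return n;
}

// One step of the RK2-style update: position advances with the previous
// velocity plus a half-step acceleration term, velocity with a half step.
JointState rk2_step(const JointState &s, float u, float dt) {
  JointState n;
  n.acc  = u;
  n.jerk = (n.acc - s.acc) / dt;
  n.pos  = s.pos + s.vel * dt + 0.5f * dt * dt * n.acc;
  n.vel  = s.vel + 0.5f * dt * n.acc;
  return n;
}
```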
- - // read start state: - float u_arr[MAX_H], dt[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; - dt[i] = traj_dt[i]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_idx * dof + d_idx]; - out_vel[0] = start_velocity[b_idx * dof + d_idx]; - out_acc[0] = start_acceleration[b_idx * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - // do rk2 integration: - - out_acc[h_idx] = u_arr[h_idx - 1]; - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; - out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx - 1] * dt[h_idx] + - 0.5 * dt[h_idx] * dt[h_idx] * out_acc[h_idx]; - out_vel[h_idx] = out_vel[h_idx - 1] + 0.5 * dt[h_idx] * out_acc[h_idx]; - } - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; - } -} - -template -__global__ void acceleration_loop_idx_kernel( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_acc, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - - // read start state: - float u_arr[MAX_H], dt[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - const int b_offset = start_idx[b_idx]; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; - dt[i] = traj_dt[i]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_offset * dof + d_idx]; - out_vel[0] = start_velocity[b_offset * dof + d_idx]; - out_acc[0] = start_acceleration[b_offset * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - // do semi implicit euler integration: - out_acc[h_idx] = u_arr[h_idx - 1]; - out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; - out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; - } - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; - } -} - -template -__global__ void acceleration_loop_idx_rk2_kernel( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t 
*out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_acc, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - - // read start state: - float u_arr[MAX_H], dt[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - const int b_offset = start_idx[b_idx]; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; - dt[i] = traj_dt[i]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_offset * dof + d_idx]; - out_vel[0] = start_velocity[b_offset * dof + d_idx]; - out_acc[0] = start_acceleration[b_offset * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - // do semi implicit euler integration: - - out_acc[h_idx] = u_arr[h_idx - 1]; - out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; - out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; - } - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; - } -} - -// Not used - -template -__global__ void position_clique_kernel( - scalar_t *out_position, scalar_t *out_velocity, scalar_t *out_acceleration, - scalar_t *out_jerk, const scalar_t *u_position, - const scalar_t *start_position, const scalar_t *start_velocity, - const scalar_t *start_acceleration, const scalar_t *traj_dt, - const int batch_size, const int horizon, const int dof) { - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (horizon * dof); - const int h_idx = (tid - b_idx * (horizon * dof)) / dof; - const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); - if (b_idx >= batch_size || h_idx >= horizon || d_idx >= dof) { - return; - } - - float new_position, new_velocity, new_acceleration, new_jerk; - const float dt = traj_dt[h_idx]; - - // read actions: batch, horizon - if (h_idx == 0) { - new_position = start_position[b_idx * dof + d_idx]; - new_velocity = start_velocity[b_idx * dof + d_idx]; - new_acceleration = start_acceleration[b_idx * dof + d_idx]; - new_jerk = 0.0; - } else if (h_idx == 1) { - - float2 u_clique = make_float2( - start_position[b_idx * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); - new_position = u_clique.y; - new_velocity = (u_clique.y - u_clique.x) * dt; // 1 - 0 - new_acceleration = - (u_clique.y - u_clique.x) * dt * dt; // 2 - 2.0 * 1 + 1 - new_jerk = (u_clique.y - u_clique.x) * dt * dt * dt; // -1 3 -3 1 - } else if (h_idx == 2) { - float3 u_clique = make_float3( - start_position[b_idx * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], - 
u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); - - new_position = u_clique.z; - new_velocity = (u_clique.z - u_clique.y) * dt; // 1 - 0 - new_acceleration = (u_clique.x - 2 * u_clique.y + u_clique.z) * - dt * dt; // 2 - 2.0 * 1 + 1 - new_jerk = (2 * u_clique.x - 3 * u_clique.y + u_clique.z) * - dt * dt * dt; // -1 3 -3 1 - - } else if (h_idx == 3) { - float4 u_clique = make_float4( - start_position[b_idx * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); - new_position = u_clique.w; - new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 - new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * - dt * dt; // 2 - 2.0 * 1 + 1 - new_jerk = - (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * - dt * dt * dt; - } else { - float4 u_clique = make_float4( - u_position[b_idx * horizon * dof + (h_idx - 4) * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], - u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); - new_position = u_clique.w; - new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 - new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * - dt * dt; // 2 - 2.0 * 1 + 1 - new_jerk = - (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * - dt * dt * dt; - } - - // h_idx = h_idx + 1; - out_position[b_idx * horizon * dof + h_idx * dof + d_idx] = - new_position; // new_position; - out_velocity[b_idx * horizon * dof + h_idx * dof + d_idx] = new_velocity; - out_acceleration[b_idx * horizon * dof + h_idx * dof + d_idx] = - new_acceleration; - out_jerk[b_idx * horizon * dof + h_idx * dof + d_idx] = new_jerk; -} - -// Not used -template -__global__ void position_clique_loop_coalesce_kernel( - scalar_t *out_position_mem, scalar_t *out_velocity_mem, - scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, - const scalar_t *u_position, const scalar_t *start_position, - const scalar_t *start_velocity, const scalar_t *start_acceleration, - const scalar_t *traj_dt, const int batch_size, const int horizon, - const int dof) { - // data is stored as batch, dof, horizon - // there are batch * horizon * dof threads: - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - - const float dt = - traj_dt[0]; // assume same dt across traj TODO: Implement variable dt - - // read start state: - float u_arr[MAX_H]; - float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; - const int b_addrs = b_idx * horizon * dof; - -#pragma unroll - for (int i = 0; i < horizon; i++) { - u_arr[i] = u_position[b_addrs + d_idx * horizon + i]; - out_pos[i] = 0; - out_vel[i] = 0; - out_acc[i] = 0; - out_jerk[i] = 0; - } - - out_pos[0] = start_position[b_idx * dof + d_idx]; - out_vel[0] = start_velocity[b_idx * dof + d_idx]; - out_acc[0] = start_acceleration[b_idx * dof + d_idx]; - out_jerk[0] = 0.0; - - for (int h_idx = 1; h_idx < horizon; h_idx++) { - - // read actions: batch, horizon - - out_pos[h_idx] = u_arr[h_idx - 1]; - - out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 0 - out_acc[h_idx] = - (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 - out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 - 
} - - // write out: - for (int h_idx = 0; h_idx < horizon; h_idx++) { - out_position_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = - out_pos[h_idx]; // new_position; - out_velocity_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = - out_vel[h_idx]; - out_acceleration_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = - out_acc[h_idx]; - out_jerk_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = - out_jerk[h_idx]; - } -} - -// Not used -template -__global__ void backward_position_clique_loop_coalesce_kernel( - scalar_t *out_grad_position, const scalar_t *grad_position, - const scalar_t *grad_velocity, const scalar_t *grad_acceleration, - const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (dof); - const int d_idx = (tid - b_idx * dof); - if (b_idx >= batch_size || d_idx >= dof) { - return; - } - const int b_addrs = b_idx * horizon * dof; - // read gradients: - float g_pos[MAX_H]; - float g_vel[MAX_H]; - float g_acc[MAX_H]; - float g_jerk[MAX_H]; - const float dt = traj_dt[0]; - const float dt_2 = dt * dt; - const float dt_3 = dt * dt * dt; -#pragma unroll - for (int h_idx = 0; h_idx < horizon; h_idx++) { - g_pos[h_idx] = grad_position[b_addrs + d_idx * horizon + h_idx]; - g_vel[h_idx] = grad_velocity[b_addrs + d_idx * horizon + h_idx]; - g_acc[h_idx] = grad_acceleration[b_addrs + d_idx * horizon + h_idx]; - g_jerk[h_idx] = grad_jerk[b_addrs + d_idx * horizon + h_idx]; - } -#pragma unroll - for (int i = 0; i < 4; i++) { - g_vel[horizon + i] = 0.0; - g_acc[horizon + i] = 0.0; - g_jerk[horizon + i] = 0.0; - } - // compute gradient and sum - for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { - - g_pos[h_idx + 1] += - ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + - (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + - (1 * g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + - 3 * g_jerk[h_idx + 3] - g_jerk[h_idx + 4]) * - dt_3); - } - - // write out: - for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { - out_grad_position[b_addrs + d_idx * horizon + h_idx] = g_pos[h_idx + 1]; - } - out_grad_position[b_addrs + d_idx * horizon + horizon - 1] = 0.0; -} - -// Not used -template -__global__ void backward_position_clique_kernel( - scalar_t *out_grad_position, const scalar_t *grad_position, - const scalar_t *grad_velocity, const scalar_t *grad_acceleration, - const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, - const int horizon, const int dof) { - // TODO: transpose h and dof to be able to directly read float2, float3, etc.. 
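The unused `*_coalesce` kernels above store trajectories per-joint rather than per-timestep, and this TODO asks for the same transposition here so that float2/float3/float4 reads become contiguous. The two layouts reduce to two index maps; a sketch with illustrative helper names, not curobolib functions:

```cpp
// Flat index into a [batch, horizon, dof] buffer, time-major within a batch:
// consecutive d_idx values are adjacent in memory (default kernels).
inline int idx_time_major(int b, int h, int d, int horizon, int dof) {
  return b * horizon * dof + h * dof + d;
}

// dof-major within a batch, as in the *_coalesce kernels: a thread that
// fixes (b, d) and walks h touches consecutive addresses instead.
inline int idx_dof_major(int b, int h, int d, int horizon, int dof) {
  return b * horizon * dof + d * horizon + h;
}
```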
- const int tid = blockDim.x * blockIdx.x + threadIdx.x; - const int b_idx = tid / (horizon * dof); - const int h_idx = (tid - b_idx * (horizon * dof)) / dof; - const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); - if (b_idx >= batch_size || h_idx >= horizon || d_idx >= dof) { - return; - } - const int b_addrs = b_idx * horizon * dof; - if (h_idx == horizon - 1) { - out_grad_position[b_addrs + (h_idx)*dof + d_idx] = 0.0; - return; - } - // read gradients: - const float dt = traj_dt[0]; - float g_u = grad_position[b_addrs + (h_idx + 1) * dof + d_idx]; - - float2 g_vel; - float3 g_acc; - float4 g_jerk; - - if (h_idx < horizon - 4) // && h_idx > 0) - { - g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], - grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); - g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]); - - g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 4) * dof + d_idx]); - g_u += - ((g_vel.x - g_vel.y) * dt + - (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + - (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); - } else if (h_idx == 0) { - g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], - grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); - - g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], - 0.0); - - g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0); - g_u += ((g_vel.x - g_vel.y) * dt + - (-1.0 * g_acc.x + 1 * g_acc.y) * dt * dt + - (-1 * g_jerk.x + 2 * g_jerk.y - 1 * g_jerk.z) * dt * dt * dt); - } else if (h_idx == horizon - 4) { - g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], - grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); - g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]); - g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0); - g_u += - ((g_vel.x - g_vel.y) * dt + - (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + - (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); - } else if (h_idx == horizon - 3) { - - g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], - grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); - g_acc = - make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], - grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], 0); - g_jerk = - make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], - grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], 0.0, 0.0); - g_u += - ((g_vel.x - g_vel.y) * dt + - (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + - (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); - } else if (h_idx == horizon - 2) { - - g_vel = - make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], 0.0); - g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], - 0, 0); - g_jerk = 
make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], 0.0, - 0.0, 0.0); - g_u += - ((g_vel.x - g_vel.y) * dt + - (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + - (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); - } - - out_grad_position[b_addrs + (h_idx)*dof + d_idx] = g_u; -} - -} // namespace -} -std::vector step_position_clique( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof) { using namespace Curobo::TensorStep; assert(horizon < MAX_H); - const int k_size = batch_size * dof; + const int k_size = batch_size * dof; int threadsPerBlock = k_size; - if (threadsPerBlock > 512) { + + if (threadsPerBlock > 512) + { threadsPerBlock = 512; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_position_clique", ([&] { - position_clique_loop_kernel - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_position.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - traj_dt.data_ptr(), batch_size, horizon, dof); - })); + out_position.scalar_type(), "step_position_clique", ([&] { + position_clique_loop_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_position.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + traj_dt.data_ptr(), batch_size, horizon, dof); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_position, out_velocity, out_acceleration, out_jerk}; + return { out_position, out_velocity, out_acceleration, out_jerk }; } -std::vector step_position_clique2( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor traj_dt, const int batch_size, const int horizon, - const int dof, - const int mode = -1) { +std::vectorstep_position_clique2( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof, + const int mode = -1) +{ using namespace Curobo::TensorStep; assert(horizon > 5); - //assert(horizon < MAX_H); - const int k_size = batch_size * dof * horizon; + + // assert(horizon < MAX_H); + const int k_size = batch_size * dof * horizon; int threadsPerBlock = k_size; - if (threadsPerBlock > 128) { + + if (threadsPerBlock > 128) + { threadsPerBlock = 128; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (mode 
== BWD_DIFF) { - - - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( out_position.scalar_type(), "step_position_clique", ([&] { - position_clique_loop_kernel2 - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_position.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - traj_dt.data_ptr(), batch_size, horizon, dof); - })); + position_clique_loop_kernel2 + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_position.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + traj_dt.data_ptr(), batch_size, horizon, dof); + })); } else if (mode == CENTRAL_DIFF) { - assert(u_position.sizes()[1] == horizon - 4); + assert(u_position.sizes()[1] == horizon - 4); - AT_DISPATCH_FLOATING_TYPES( + AT_DISPATCH_FLOATING_TYPES( out_position.scalar_type(), "step_position_clique", ([&] { - position_clique_loop_kernel2 - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_position.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - traj_dt.data_ptr(), batch_size, horizon, dof); - })); - } - else - { - assert (false); - } - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return {out_position, out_velocity, out_acceleration, out_jerk}; -} -std::vector step_position_clique2_idx( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_position, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const int mode = -1) { - using namespace Curobo::TensorStep; - - //assert(horizon < MAX_H); - assert(horizon > 5); - - - const int k_size = batch_size * dof * horizon; - int threadsPerBlock = k_size; - if (threadsPerBlock > 128) { - threadsPerBlock = 128; - } - - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (mode == BWD_DIFF) - { - - assert(false); - AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_position_clique", ([&] { - position_clique_loop_idx_kernel2 - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_position.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - start_idx.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); - } - - else if (mode == CENTRAL_DIFF) - { - assert(u_position.sizes()[1] == horizon - 4); - - AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_position_clique", ([&] { - position_clique_loop_idx_kernel2 - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_position.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - start_idx.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); + position_clique_loop_kernel2 + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), 
u_position.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + traj_dt.data_ptr(), batch_size, horizon, dof); + })); } else { @@ -1504,96 +1601,173 @@ std::vector step_position_clique2_idx( } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_position, out_velocity, out_acceleration, out_jerk}; + return { out_position, out_velocity, out_acceleration, out_jerk }; } -std::vector backward_step_position_clique( - torch::Tensor out_grad_position, const torch::Tensor grad_position, - const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, - const torch::Tensor grad_jerk, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof) +std::vectorstep_position_clique2_idx( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const int mode = -1) +{ + using namespace Curobo::TensorStep; + + // assert(horizon < MAX_H); + assert(horizon > 5); + + + const int k_size = batch_size * dof * horizon; + int threadsPerBlock = k_size; + + if (threadsPerBlock > 128) + { + threadsPerBlock = 128; + } + + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (mode == BWD_DIFF) + { + assert(false); + AT_DISPATCH_FLOATING_TYPES( + out_position.scalar_type(), "step_position_clique", ([&] { + position_clique_loop_idx_kernel2 + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_position.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + start_idx.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); + } + + else if (mode == CENTRAL_DIFF) + { + assert(u_position.sizes()[1] == horizon - 4); + + AT_DISPATCH_FLOATING_TYPES( + out_position.scalar_type(), "step_position_clique", ([&] { + position_clique_loop_idx_kernel2 + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_position.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + start_idx.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); + } + else + { + assert(false); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { out_position, out_velocity, out_acceleration, out_jerk }; +} + +std::vectorbackward_step_position_clique( + torch::Tensor out_grad_position, const torch::Tensor grad_position, + const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, + const torch::Tensor grad_jerk, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof) { using namespace Curobo::TensorStep; assert(horizon < MAX_H - 4); - const int k_size = batch_size * dof; + const int k_size = batch_size * dof; int threadsPerBlock = k_size; - if (threadsPerBlock > 128) { + + if (threadsPerBlock > 128) + { threadsPerBlock = 128; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = 
at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( - out_grad_position.scalar_type(), "backward_step_position_clique", ([&] { - backward_position_clique_loop_kernel - <<>>( - out_grad_position.data_ptr(), - grad_position.data_ptr(), - grad_velocity.data_ptr(), - grad_acceleration.data_ptr(), - grad_jerk.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); + out_grad_position.scalar_type(), "backward_step_position_clique", ([&] { + backward_position_clique_loop_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_grad_position.data_ptr(), + grad_position.data_ptr(), + grad_velocity.data_ptr(), + grad_acceleration.data_ptr(), + grad_jerk.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_grad_position}; + return { out_grad_position }; } -std::vector backward_step_position_clique2( - torch::Tensor out_grad_position, const torch::Tensor grad_position, - const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, - const torch::Tensor grad_jerk, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const int mode = -1) { - //assert(horizon < MAX_H - 4); +std::vectorbackward_step_position_clique2( + torch::Tensor out_grad_position, const torch::Tensor grad_position, + const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, + const torch::Tensor grad_jerk, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const int mode = -1) +{ + // assert(horizon < MAX_H - 4); using namespace Curobo::TensorStep; assert(horizon > 5); // const int k_size = batch_size * dof; - const int k_size = batch_size * dof * horizon; + const int k_size = batch_size * dof * horizon; int threadsPerBlock = k_size; - if (threadsPerBlock > 128) { + + if (threadsPerBlock > 128) + { threadsPerBlock = 128; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (mode == BWD_DIFF) { - - assert(false); // not supported anymore - AT_DISPATCH_FLOATING_TYPES( + assert(false); // not supported anymore + AT_DISPATCH_FLOATING_TYPES( out_grad_position.scalar_type(), "backward_step_position_clique", ([&] { - backward_position_clique_loop_backward_difference_kernel2 - <<>>( - out_grad_position.data_ptr(), - grad_position.data_ptr(), - grad_velocity.data_ptr(), - grad_acceleration.data_ptr(), - grad_jerk.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); - + backward_position_clique_loop_backward_difference_kernel2 + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_grad_position.data_ptr(), + grad_position.data_ptr(), + grad_velocity.data_ptr(), + grad_acceleration.data_ptr(), + grad_jerk.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); } else if (mode == CENTRAL_DIFF) { - assert(out_grad_position.sizes()[1] == horizon - 4); - AT_DISPATCH_FLOATING_TYPES( + assert(out_grad_position.sizes()[1] == horizon - 4); + AT_DISPATCH_FLOATING_TYPES( out_grad_position.scalar_type(), "backward_step_position_clique", ([&] { - backward_position_clique_loop_central_difference_kernel2 - <<>>( - out_grad_position.data_ptr(), - grad_position.data_ptr(), - grad_velocity.data_ptr(), - grad_acceleration.data_ptr(), - grad_jerk.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); + backward_position_clique_loop_central_difference_kernel2 + << < 
blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_grad_position.data_ptr(), + grad_position.data_ptr(), + grad_velocity.data_ptr(), + grad_acceleration.data_ptr(), + grad_jerk.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); } else { @@ -1602,7 +1776,7 @@ std::vector backward_step_position_clique2( C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_grad_position}; + return { out_grad_position }; } std::vector @@ -1612,109 +1786,119 @@ step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity, const torch::Tensor start_velocity, const torch::Tensor start_acceleration, const torch::Tensor traj_dt, const int batch_size, - const int horizon, const int dof, const bool use_rk2 = true) { + const int horizon, const int dof, const bool use_rk2 = true) +{ assert(horizon < MAX_H); using namespace Curobo::TensorStep; - const int k_size = batch_size * dof; + const int k_size = batch_size * dof; int threadsPerBlock = k_size; - if (threadsPerBlock > 512) { + + if (threadsPerBlock > 512) + { threadsPerBlock = 512; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (use_rk2) { + if (use_rk2) + { AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_acceleration", ([&] { - acceleration_loop_rk2_kernel - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_acc.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - traj_dt.data_ptr(), batch_size, horizon, dof); - })); - + out_position.scalar_type(), "step_acceleration", ([&] { + acceleration_loop_rk2_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_acc.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + traj_dt.data_ptr(), batch_size, horizon, dof); + })); } - else { + else + { AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_acceleration", ([&] { - acceleration_loop_kernel - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_acc.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - traj_dt.data_ptr(), batch_size, horizon, dof); - })); + out_position.scalar_type(), "step_acceleration", ([&] { + acceleration_loop_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_acc.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + traj_dt.data_ptr(), batch_size, horizon, dof); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_position, out_velocity, out_acceleration, out_jerk}; + return { out_position, out_velocity, out_acceleration, out_jerk }; } -std::vector step_acceleration_idx( - torch::Tensor out_position, torch::Tensor out_velocity, - torch::Tensor out_acceleration, torch::Tensor out_jerk, - const torch::Tensor u_acc, const torch::Tensor start_position, - const torch::Tensor start_velocity, const torch::Tensor start_acceleration, - const torch::Tensor start_idx, const torch::Tensor traj_dt, - const int batch_size, const int horizon, const int dof, - const bool use_rk2 = true) { 
+std::vectorstep_acceleration_idx( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_acc, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const bool use_rk2 = true) +{ assert(horizon < MAX_H); using namespace Curobo::TensorStep; - const int k_size = batch_size * dof; + const int k_size = batch_size * dof; int threadsPerBlock = k_size; - if (threadsPerBlock > 512) { + + if (threadsPerBlock > 512) + { threadsPerBlock = 512; } - int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (use_rk2) { + if (use_rk2) + { AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_acceleration", ([&] { - acceleration_loop_idx_rk2_kernel - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_acc.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - start_idx.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); - } else { + out_position.scalar_type(), "step_acceleration", ([&] { + acceleration_loop_idx_rk2_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_acc.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + start_idx.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); + } + else + { AT_DISPATCH_FLOATING_TYPES( - out_position.scalar_type(), "step_acceleration", ([&] { - acceleration_loop_idx_kernel - <<>>( - out_position.data_ptr(), - out_velocity.data_ptr(), - out_acceleration.data_ptr(), - out_jerk.data_ptr(), u_acc.data_ptr(), - start_position.data_ptr(), - start_velocity.data_ptr(), - start_acceleration.data_ptr(), - start_idx.data_ptr(), traj_dt.data_ptr(), - batch_size, horizon, dof); - })); + out_position.scalar_type(), "step_acceleration", ([&] { + acceleration_loop_idx_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + out_position.data_ptr(), + out_velocity.data_ptr(), + out_acceleration.data_ptr(), + out_jerk.data_ptr(), u_acc.data_ptr(), + start_position.data_ptr(), + start_velocity.data_ptr(), + start_acceleration.data_ptr(), + start_idx.data_ptr(), traj_dt.data_ptr(), + batch_size, horizon, dof); + })); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - return {out_position, out_velocity, out_acceleration, out_jerk}; + return { out_position, out_velocity, out_acceleration, out_jerk }; } diff --git a/src/curobo/curobolib/cpp/update_best_kernel.cu b/src/curobo/curobolib/cpp/update_best_kernel.cu index c96eaf7..c6715da 100644 --- a/src/curobo/curobolib/cpp/update_best_kernel.cu +++ b/src/curobo/curobolib/cpp/update_best_kernel.cu @@ -26,101 +26,108 @@ #include #include -namespace Curobo { - -namespace Optimization { - -// We launch with d_opt*cost_s1 threads. -// We assume that cost_s2 is always 1. 
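The kernel below keeps the best solution seen so far for each optimization problem. Its acceptance rule requires both an absolute and a relative improvement, which keeps numerical-noise "improvements" from resetting the bookkeeping; as a one-function host-side sketch mirroring the thresholds visible in the kernel body (illustrative name, not curobolib API):

```cpp
// A candidate cost replaces the incumbent only if it improves by more than
// delta_threshold AND beats best_cost scaled by relative_threshold.
bool accept_candidate(float best_cost, float new_cost,
                      float delta_threshold, float relative_threshold) {
  return (best_cost - new_cost) > delta_threshold &&
         new_cost < best_cost * relative_threshold;
}
```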
-template <typename scalar_t>
-__global__ void update_best_kernel(scalar_t *best_cost,        // 200x1
-                                   scalar_t *best_q,           // 200x7
-                                   int16_t *best_iteration,    // 200 x 1
-                                   int16_t *current_iteration, // 1
-                                   const scalar_t *cost,       // 200x1
-                                   const scalar_t *q,          // 200x7
-                                   const int d_opt,            // 7
-                                   const int cost_s1,          // 200
-                                   const int cost_s2,
-                                   const int iteration,
-                                   const float delta_threshold,
-                                   const float relative_threshold) // 1
+namespace Curobo
 {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int size = cost_s1 * d_opt; // size of best_q
-  if (tid >= size) {
-    return;
-  }
-
-  const int cost_idx = tid / d_opt;
-  const float cost_new = cost[cost_idx];
-  const float best_cost_in = best_cost[cost_idx];
-  const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
-  if (change) {
-    best_q[tid] = q[tid]; // update best_q
-
-    if (tid % d_opt == 0) {
-      best_cost[cost_idx] = cost_new ; // update best_cost
-      //best_iteration[cost_idx] = curr_iter + iteration; //
-      // this tensor keeps track of whether the cost reduced by at least
-      // delta_threshold.
-      // here iteration is the last_best parameter.
-    }
-  }
-
-  if (tid % d_opt == 0)
+  namespace Optimization
   {
-    if (change)
+    // We launch with d_opt*cost_s1 threads.
+    // We assume that cost_s2 is always 1.
+    template<typename scalar_t>
+    __global__ void update_best_kernel(scalar_t *best_cost,        // 200x1
+                                       scalar_t *best_q,           // 200x7
+                                       int16_t *best_iteration,    // 200 x 1
+                                       int16_t *current_iteration, // 1
+                                       const scalar_t *cost,       // 200x1
+                                       const scalar_t *q,          // 200x7
+                                       const int d_opt,            // 7
+                                       const int cost_s1,          // 200
+                                       const int cost_s2,
+                                       const int iteration,
+                                       const float delta_threshold,
+                                       const float relative_threshold) // 1
     {
-      best_iteration[cost_idx] = 0;
+      int tid  = blockIdx.x * blockDim.x + threadIdx.x;
+      int size = cost_s1 * d_opt; // size of best_q
+
+      if (tid >= size)
+      {
+        return;
+      }
+
+      const int cost_idx       = tid / d_opt;
+      const float cost_new     = cost[cost_idx];
+      const float best_cost_in = best_cost[cost_idx];
+      const bool change        = (best_cost_in - cost_new) > delta_threshold &&
+                                 cost_new < best_cost_in * relative_threshold;
+
+      if (change)
+      {
+        best_q[tid] = q[tid]; // update best_q
+
+        if (tid % d_opt == 0)
+        {
+          best_cost[cost_idx] = cost_new; // update best_cost
+          // best_iteration[cost_idx] = curr_iter + iteration; //
+          // this tensor keeps track of whether the cost reduced by at least
+          // delta_threshold.
+          // here iteration is the last_best parameter.
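Restated in Python for reference (a paraphrase of the kernel's test above, not library code), a candidate only replaces the incumbent when it wins by both an absolute margin and a relative margin:

```python
def accept_new_best(best_cost: float, cost_new: float,
                    delta_threshold: float,
                    relative_threshold: float = 0.999) -> bool:
    # the new cost must beat the incumbent by an absolute margin
    # (delta_threshold) and by a relative factor (relative_threshold)
    return (best_cost - cost_new) > delta_threshold and \
        cost_new < best_cost * relative_threshold


# with delta_threshold=1e-3: a drop from 1.0 to 0.9995 is rejected (absolute
# margin too small), while a drop from 1.0 to 0.99 passes both tests.
assert not accept_new_best(1.0, 0.9995, delta_threshold=1e-3)
assert accept_new_best(1.0, 0.99, delta_threshold=1e-3)
```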
+        }
+      }
+
+      if (tid % d_opt == 0)
+      {
+        if (change)
+        {
+          best_iteration[cost_idx] = 0;
+        }
+        else
+        {
+          best_iteration[cost_idx] -= 1;
+        }
+      }
+
+      // .if (tid == 0)
+      // {
+      //   curr_iter += 1;
+      //   current_iteration[0] = curr_iter;
+      // }
     }
-    else
-    {
-      best_iteration[cost_idx] -= 1;
-    }
-  }
-
-  //.if (tid == 0)
-  //{
-  //  curr_iter += 1;
-  //  current_iteration[0] = curr_iter;
-  //}
-
-}
-} // namespace Optimization
-} // namespace Curobo
+  } // namespace Optimization
+}   // namespace Curobo
 
 std::vector<torch::Tensor>
 update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
                  torch::Tensor best_iteration,
                  torch::Tensor current_iteration,
-                 const torch::Tensor cost,
+                 const torch::Tensor  cost,
                  const torch::Tensor q, const int d_opt, const int cost_s1,
                  const int cost_s2, const int iteration,
                  const float delta_threshold,
-                 const float relative_threshold = 0.999) {
+                 const float relative_threshold = 0.999)
+{
   using namespace Curobo::Optimization;
   const int threadsPerBlock = 128;
-  const int cost_size = cost_s1 * d_opt;
+  const int cost_size       = cost_s1 * d_opt;
   assert(cost_s2 == 1); // assumption
   const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
+
   // printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
   // blocksPerGrid);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_FLOATING_TYPES(
-      cost.scalar_type(), "update_best_cu", ([&] {
-        update_best_kernel<scalar_t>
-            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-                best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
-                best_iteration.data_ptr<int16_t>(),
-                current_iteration.data_ptr<int16_t>(),
-                cost.data_ptr<scalar_t>(),
-                q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
-                delta_threshold, relative_threshold);
-      }));
+      cost.scalar_type(), "update_best_cu", ([&] {
+      update_best_kernel<scalar_t>
+        << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
+        best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
+        best_iteration.data_ptr<int16_t>(),
+        current_iteration.data_ptr<int16_t>(),
+        cost.data_ptr<scalar_t>(),
+        q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
+        delta_threshold, relative_threshold);
+    }));
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
-  return {best_cost, best_q, best_iteration};
-}
\ No newline at end of file
+  return { best_cost, best_q, best_iteration };
+}
diff --git a/src/curobo/curobolib/geom.py b/src/curobo/curobolib/geom.py
index 1d5a9fe..fd1caf0 100644
--- a/src/curobo/curobolib/geom.py
+++ b/src/curobo/curobolib/geom.py
@@ -153,6 +153,8 @@ def get_pose_distance(
     vec_convergence,
     run_weight,
     run_vec_weight,
+    offset_waypoint,
+    offset_tstep_fraction,
     batch_pose_idx,
     batch_size,
     horizon,
@@ -161,6 +163,7 @@ def get_pose_distance(
     write_grad=False,
     write_distance=False,
     use_metric=False,
+    project_distance=True,
 ):
     if batch_pose_idx.shape[0] != batch_size:
         raise ValueError("Index buffer size is different from batch size")
@@ -181,6 +184,8 @@ def get_pose_distance(
         vec_convergence,
         run_weight,
         run_vec_weight,
+        offset_waypoint,
+        offset_tstep_fraction,
         batch_pose_idx,
         batch_size,
         horizon,
@@ -189,6 +194,7 @@ def get_pose_distance(
         write_grad,
         write_distance,
         use_metric,
+        project_distance,
     )
 
     out_distance = r[0]
@@ -229,6 +235,331 @@ def get_pose_distance_backward(
     return r[0], r[1]
 
 
+@torch.jit.script
+def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec):
+    grad_vec = grad_g_dist + (grad_out_distance * weight)
+    grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec
+    return grad
+
+
+# full method:
+@torch.jit.script
+def backward_full_PoseError_jit(
+    grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q
+):
+    p_grad = (grad_g_dist + (grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p
+    q_grad =
(grad_r_err + (grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q + # p_grad = ((grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p + # q_grad = ((grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q + + return p_grad, q_grad + + +class PoseErrorDistance(torch.autograd.Function): + @staticmethod + def forward( + ctx, + current_position, + goal_position, + current_quat, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + offset_waypoint, + offset_tstep_fraction, + batch_pose_idx, + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + out_p_grad, + out_q_grad, + batch_size, + horizon, + mode, # =PoseErrorType.BATCH_GOAL.value, + num_goals, + use_metric, # =False, + project_distance, # =True, + ): + # out_distance = current_position[..., 0].detach().clone() * 0.0 + # out_position_distance = out_distance.detach().clone() + # out_rotation_distance = out_distance.detach().clone() + # out_vec = ( + # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) + # * 0.0 + # ) + # out_idx = out_distance.clone().to(dtype=torch.long) + + ( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + ) = get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + current_position.contiguous(), + goal_position, + current_quat.contiguous(), + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + offset_waypoint, + offset_tstep_fraction, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + current_position.requires_grad, + True, + use_metric, + project_distance, + ) + ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad) + return out_distance, out_position_distance, out_rotation_distance, out_idx # .view(-1,1) + + @staticmethod + def backward(ctx, grad_out_distance, grad_g_dist, grad_r_err, grad_out_idx): + (g_vec_p, g_vec_q, weight, out_grad_p, out_grad_q) = ctx.saved_tensors + pos_grad = None + quat_grad = None + batch_size = g_vec_p.shape[0] * g_vec_p.shape[1] + if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: + pos_grad, quat_grad = get_pose_distance_backward( + out_grad_p, + out_grad_q, + grad_out_distance.contiguous(), + grad_g_dist.contiguous(), + grad_r_err.contiguous(), + weight, + g_vec_p, + g_vec_q, + batch_size, + use_distance=True, + ) + + elif ctx.needs_input_grad[0]: + pos_grad = backward_PoseError_jit(grad_g_dist, grad_out_distance, weight[1], g_vec_p) + + elif ctx.needs_input_grad[2]: + quat_grad = backward_PoseError_jit(grad_r_err, grad_out_distance, weight[0], g_vec_q) + + return ( + pos_grad, + None, + quat_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class PoseError(torch.autograd.Function): + @staticmethod + def forward( + ctx, + current_position: torch.Tensor, + goal_position: torch.Tensor, + current_quat: torch.Tensor, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + offset_waypoint, + offset_tstep_fraction, + batch_pose_idx, + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + out_p_grad, + out_q_grad, + batch_size, + horizon, + mode, + num_goals, + use_metric, + project_distance, + return_loss, + ): + """Compute error in pose + + _extended_summary_ 
+ + Args: + ctx: _description_ + current_position: _description_ + goal_position: _description_ + current_quat: _description_ + goal_quat: _description_ + vec_weight: _description_ + weight: _description_ + vec_convergence: _description_ + run_weight: _description_ + run_vec_weight: _description_ + offset_waypoint: _description_ + offset_tstep_fraction: _description_ + batch_pose_idx: _description_ + out_distance: _description_ + out_position_distance: _description_ + out_rotation_distance: _description_ + out_p_vec: _description_ + out_r_vec: _description_ + out_idx: _description_ + out_p_grad: _description_ + out_q_grad: _description_ + batch_size: _description_ + horizon: _description_ + mode: _description_ + num_goals: _description_ + use_metric: _description_ + project_distance: _description_ + return_loss: _description_ + + Returns: + _description_ + """ + # out_distance = current_position[..., 0].detach().clone() * 0.0 + # out_position_distance = out_distance.detach().clone() + # out_rotation_distance = out_distance.detach().clone() + # out_vec = ( + # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) + # * 0.0 + # ) + # out_idx = out_distance.clone().to(dtype=torch.long) + ctx.return_loss = return_loss + ( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + ) = get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + current_position.contiguous(), + goal_position, + current_quat.contiguous(), + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + offset_waypoint, + offset_tstep_fraction, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + current_position.requires_grad, + False, + use_metric, + project_distance, + ) + ctx.save_for_backward(out_p_vec, out_r_vec) + return out_distance + + @staticmethod + def backward(ctx, grad_out_distance): # , grad_g_dist, grad_r_err, grad_out_idx): + pos_grad = None + quat_grad = None + if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + pos_grad = g_vec_p + quat_grad = g_vec_q + if ctx.return_loss: + pos_grad = pos_grad * grad_out_distance.unsqueeze(1) + quat_grad = quat_grad * grad_out_distance.unsqueeze(1) + + elif ctx.needs_input_grad[0]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + pos_grad = g_vec_p + if ctx.return_loss: + pos_grad = pos_grad * grad_out_distance.unsqueeze(1) + elif ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + quat_grad = g_vec_q + if ctx.return_loss: + quat_grad = quat_grad * grad_out_distance.unsqueeze(1) + return ( + pos_grad, + None, + quat_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + class SdfSphereOBB(torch.autograd.Function): @staticmethod def forward( diff --git a/src/curobo/geom/cv.py b/src/curobo/geom/cv.py new file mode 100644 index 0000000..f2df669 --- /dev/null +++ b/src/curobo/geom/cv.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Third Party
+import torch
+
+
+@torch.jit.script
+def project_depth_to_pointcloud(depth_image: torch.Tensor, intrinsics_matrix: torch.Tensor):
+    """Projects a depth image to a point cloud.
+
+    Args:
+        depth_image: float tensor of shape (h, w).
+        intrinsics_matrix: float tensor containing the 3x3 camera intrinsics.
+
+    Returns:
+        float tensor of shape (h, w, 3).
+    """
+    fx = intrinsics_matrix[0, 0]
+    fy = intrinsics_matrix[1, 1]
+    cx = intrinsics_matrix[0, 2]
+    cy = intrinsics_matrix[1, 2]
+    height = depth_image.shape[0]
+    width = depth_image.shape[1]
+    input_x = torch.arange(width, dtype=torch.float32, device=depth_image.device)
+    input_y = torch.arange(height, dtype=torch.float32, device=depth_image.device)
+    input_x, input_y = torch.meshgrid(input_x, input_y, indexing="ij")
+    # transpose so the first axis indexes rows (height) and the second columns (width):
+    input_x, input_y = input_x.T, input_y.T
+
+    input_z = depth_image
+    output_x = (input_x * input_z - cx * input_z) / fx
+    output_y = (input_y * input_z - cy * input_z) / fy
+    raw_pc = torch.stack([output_x, output_y, input_z], -1)
+
+    return raw_pc
+
+
+@torch.jit.script
+def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor):
+    """Computes a projection ray per pixel for a batch of camera intrinsics.
+
+    Args:
+        height: image height in pixels.
+        width: image width in pixels.
+        intrinsics_matrix: float tensor of shape (b, 3, 3).
+
+    Returns:
+        float tensor of shape (b, h * w, 3).
+    """
+    # reshaped to (b, 1, 1) so the values broadcast against the (b, h, w) pixel grids below:
+    fx = intrinsics_matrix[:, 0, 0].view(-1, 1, 1)
+    fy = intrinsics_matrix[:, 1, 1].view(-1, 1, 1)
+    cx = intrinsics_matrix[:, 0, 2].view(-1, 1, 1)
+    cy = intrinsics_matrix[:, 1, 2].view(-1, 1, 1)
+
+    input_x = torch.arange(width, dtype=torch.float32, device=intrinsics_matrix.device)
+    input_y = torch.arange(height, dtype=torch.float32, device=intrinsics_matrix.device)
+    input_x, input_y = torch.meshgrid(input_x, input_y, indexing="ij")
+
+    input_x, input_y = input_x.T, input_y.T
+
+    input_x = input_x.unsqueeze(0).repeat(intrinsics_matrix.shape[0], 1, 1)
+    input_y = input_y.unsqueeze(0).repeat(intrinsics_matrix.shape[0], 1, 1)
+
+    input_z = torch.ones(
+        (intrinsics_matrix.shape[0], height, width),
+        device=intrinsics_matrix.device,
+        dtype=torch.float32,
+    )
+
+    output_x = (input_x - cx) / fx
+    output_y = (input_y - cy) / fy
+
+    rays = torch.stack([output_x, output_y, input_z], -1).reshape(
+        intrinsics_matrix.shape[0], width * height, 3
+    )
+    # pre-scale rays so that depth images stored in millimeters project to meters:
+    rays = rays * (1.0 / 1000.0)
+    return rays
+
+
+@torch.jit.script
+def project_pointcloud_to_depth(
+    pointcloud: torch.Tensor,
+    output_image: torch.Tensor,
+):
+    """Projects a pixel-aligned pointcloud back to a depth image.
+
+    Args:
+        pointcloud: float tensor of shape (h * w, 3).
+        output_image: float tensor of shape (h, w) that receives the depth values.
+ + Returns: + array of float (h, w) + """ + width, height = output_image.shape + + output_image = output_image.view(-1) + output_image[:] = pointcloud[:, 2] + output_image = output_image.view(width, height) + return output_image + + +@torch.jit.script +def project_depth_using_rays( + depth_image: torch.Tensor, rays: torch.Tensor, filter_origin: bool = False +): + if filter_origin: + depth_image = torch.where(depth_image < 0.01, 0, depth_image) + + depth_image = depth_image.view(depth_image.shape[0], -1, 1).contiguous() + points = depth_image * rays + return points diff --git a/src/curobo/geom/sdf/world.py b/src/curobo/geom/sdf/world.py index ec3a632..ffa4981 100644 --- a/src/curobo/geom/sdf/world.py +++ b/src/curobo/geom/sdf/world.py @@ -364,8 +364,12 @@ class WorldPrimitiveCollision(WorldCollision): if self.cache is not None and "obb" in self.cache and self.cache["obb"] not in [None, 0]: self._create_obb_cache(self.cache["obb"]) - def load_collision_model(self, world_config: WorldConfig, env_idx=0): - self._load_collision_model_in_cache(world_config, env_idx) + def load_collision_model( + self, world_config: WorldConfig, env_idx=0, fix_cache_reference: bool = False + ): + self._load_collision_model_in_cache( + world_config, env_idx, fix_cache_reference=fix_cache_reference + ) def load_batch_collision_model(self, world_config_list: List[WorldConfig]): """Load a batch of collision environments from a list of world configs. @@ -436,7 +440,9 @@ class WorldPrimitiveCollision(WorldCollision): ) self.collision_types["primitive"] = True - def _load_collision_model_in_cache(self, world_config: WorldConfig, env_idx: int = 0): + def _load_collision_model_in_cache( + self, world_config: WorldConfig, env_idx: int = 0, fix_cache_reference: bool = False + ): cube_objs = world_config.cuboid max_obb = len(cube_objs) self.world_model = world_config @@ -444,8 +450,11 @@ class WorldPrimitiveCollision(WorldCollision): log_info("No OBB objs") return if self._cube_tensor_list is None or self._cube_tensor_list[0].shape[1] < max_obb: - log_info("Creating Obb cache" + str(max_obb)) - self._create_obb_cache(max_obb) + if not fix_cache_reference: + log_info("Creating Obb cache" + str(max_obb)) + self._create_obb_cache(max_obb) + else: + log_error("number of OBB is larger than collision cache, create larger cache.") # load as a batch: pose_batch = [c.pose for c in cube_objs] @@ -835,7 +844,9 @@ class WorldPrimitiveCollision(WorldCollision): return dist def clear_cache(self): - pass + if self._cube_tensor_list is not None: + self._cube_tensor_list[2][:] = 1 + self._env_n_obbs[:] = 0 def get_voxels_in_bounding_box( self, diff --git a/src/curobo/geom/sdf/world_blox.py b/src/curobo/geom/sdf/world_blox.py index 3144866..e844637 100644 --- a/src/curobo/geom/sdf/world_blox.py +++ b/src/curobo/geom/sdf/world_blox.py @@ -55,7 +55,7 @@ class WorldBloxCollision(WorldMeshCollision): self._blox_voxel_sizes = [0.02] super().__init__(config) - def load_collision_model(self, world_model: WorldConfig): + def load_collision_model(self, world_model: WorldConfig, fix_cache_reference: bool = False): # load nvblox mesh if len(world_model.blox) > 0: # check if there is a mapper instance: @@ -109,15 +109,17 @@ class WorldBloxCollision(WorldMeshCollision): self._blox_names = names self.collision_types["blox"] = True - return super().load_collision_model(world_model) + return super().load_collision_model(world_model, fix_cache_reference=fix_cache_reference) def clear_cache(self): self._blox_mapper.clear() + 
self._blox_mapper.update_hashmaps() super().clear_cache() def clear_blox_layer(self, layer_name: str): index = self._blox_names.index(layer_name) self._blox_mapper.clear(index) + self._blox_mapper.update_hashmaps() def _get_blox_sdf( self, @@ -147,6 +149,7 @@ class WorldBloxCollision(WorldMeshCollision): speed_dt, sweep_steps, enable_speed_metric, + return_loss=False, ): d = self._blox_mapper.query_sphere_trajectory_sdf_cost( query_spheres, @@ -160,6 +163,8 @@ class WorldBloxCollision(WorldMeshCollision): self._blox_tensor_list[1], sweep_steps, enable_speed_metric, + return_loss, + use_experimental=False, ) return d @@ -279,6 +284,7 @@ class WorldBloxCollision(WorldMeshCollision): speed_dt=speed_dt, sweep_steps=sweep_steps, enable_speed_metric=enable_speed_metric, + return_loss=return_loss, ) if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( @@ -332,6 +338,7 @@ class WorldBloxCollision(WorldMeshCollision): speed_dt=speed_dt, sweep_steps=sweep_steps, enable_speed_metric=enable_speed_metric, + return_loss=return_loss, ) if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( diff --git a/src/curobo/geom/sdf/world_mesh.py b/src/curobo/geom/sdf/world_mesh.py index 5d8035e..3bf32ad 100644 --- a/src/curobo/geom/sdf/world_mesh.py +++ b/src/curobo/geom/sdf/world_mesh.py @@ -72,7 +72,11 @@ class WorldMeshCollision(WorldPrimitiveCollision): return super()._init_cache() def load_collision_model( - self, world_model: WorldConfig, env_idx: int = 0, load_obb_obs: bool = True + self, + world_model: WorldConfig, + env_idx: int = 0, + load_obb_obs: bool = True, + fix_cache_reference: bool = False, ): max_nmesh = len(world_model.mesh) if max_nmesh > 0: @@ -91,14 +95,16 @@ class WorldMeshCollision(WorldPrimitiveCollision): self.collision_types["mesh"] = True if load_obb_obs: - super().load_collision_model(world_model, env_idx) + super().load_collision_model( + world_model, env_idx, fix_cache_reference=fix_cache_reference + ) else: self.world_model = world_model def load_batch_collision_model(self, world_config_list: List[WorldConfig]): max_nmesh = max([len(x.mesh) for x in world_config_list]) if self._mesh_tensor_list is None or self._mesh_tensor_list[0].shape[1] < max_nmesh: - log_info("Creating new Mesh cache: " + str(max_nmesh)) + log_warn("Creating new Mesh cache: " + str(max_nmesh)) self._create_mesh_cache(max_nmesh) for env_idx, world_model in enumerate(world_config_list): diff --git a/src/curobo/geom/transform.py b/src/curobo/geom/transform.py index cf802cc..1ab70d6 100644 --- a/src/curobo/geom/transform.py +++ b/src/curobo/geom/transform.py @@ -228,6 +228,61 @@ def pose_inverse( return out_position, out_quaternion +@wp.kernel +def compute_pose_inverse( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + out_position: wp.array(dtype=wp.vec3), + out_quat: wp.array(dtype=wp.vec4), +): # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_position = position[b_idx] + in_quat = quat[b_idx] + + # read point + # create a transform from a vector/quaternion: + t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + t_3 = wp.transform_inverse(t_1) + + # write pt: + out_q = wp.transform_get_rotation(t_3) + + out_v = wp.vec4() + out_v[0] = out_q[3] # out_q[3] + out_v[1] = out_q[0] # [0] + out_v[2] = out_q[1] # wp.extract(out_q, 1) + out_v[3] = out_q[2] # wp.extract(out_q, 2) + + out_position[b_idx] = wp.transform_get_translation(t_3) + 
out_quat[b_idx] = out_v + + +@wp.kernel +def compute_matrix_to_quat( + in_mat: wp.array(dtype=wp.mat33), + out_quat: wp.array(dtype=wp.vec4), +): + # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_m = in_mat[b_idx] + + # read point + # create a transform from a vector/quaternion: + out_q = wp.quat_from_matrix(in_m) + + out_v = wp.vec4() + out_v[0] = out_q[3] # wp.extract(out_q, 3) + out_v[1] = out_q[0] # wp.extract(out_q, 0) + out_v[2] = out_q[1] # wp.extract(out_q, 1) + out_v[3] = out_q[2] # wp.extract(out_q, 2) + # write pt: + out_quat[b_idx] = out_v + + @wp.kernel def compute_transform_point( position: wp.array(dtype=wp.vec3), @@ -331,37 +386,6 @@ def compute_batch_pose_multipy( out_quat[b_idx] = out_v -@wp.kernel -def compute_pose_inverse( - position: wp.array(dtype=wp.vec3), - quat: wp.array(dtype=wp.vec4), - out_position: wp.array(dtype=wp.vec3), - out_quat: wp.array(dtype=wp.vec4), -): # b pose_1 and b pose_2, compute pose_1 * pose_2 - b_idx = wp.tid() - # read data: - - in_position = position[b_idx] - in_quat = quat[b_idx] - - # read point - # create a transform from a vector/quaternion: - t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) - t_3 = wp.transform_inverse(t_1) - - # write pt: - out_q = wp.transform_get_rotation(t_3) - - out_v = wp.vec4() - out_v[0] = out_q[3] - out_v[1] = out_q[0] - out_v[2] = out_q[1] - out_v[3] = out_q[2] - - out_position[b_idx] = wp.transform_get_translation(t_3) - out_quat[b_idx] = out_v - - @wp.kernel def compute_quat_to_matrix( quat: wp.array(dtype=wp.vec4), @@ -382,30 +406,6 @@ def compute_quat_to_matrix( out_mat[b_idx] = m_1 -@wp.kernel -def compute_matrix_to_quat( - in_mat: wp.array(dtype=wp.mat33), - out_quat: wp.array(dtype=wp.vec4), -): - # b pose_1 and b pose_2, compute pose_1 * pose_2 - b_idx = wp.tid() - # read data: - - in_m = in_mat[b_idx] - - # read point - # create a transform from a vector/quaternion: - out_q = wp.quat_from_matrix(in_m) - - out_v = wp.vec4() - out_v[0] = out_q[3] - out_v[1] = out_q[0] - out_v[2] = out_q[1] - out_v[3] = out_q[2] - # write pt: - out_quat[b_idx] = out_v - - @wp.kernel def compute_pose_multipy( position: wp.array(dtype=wp.vec3), @@ -962,6 +962,7 @@ class PoseInverse(torch.autograd.Function): adj_quaternion, ) ctx.b = b + wp.launch( kernel=compute_pose_inverse, dim=b, @@ -1071,6 +1072,7 @@ class QuatToMatrix(torch.autograd.Function): adj_quaternion, ) ctx.b = b + wp.launch( kernel=compute_quat_to_matrix, dim=b, @@ -1153,6 +1155,7 @@ class MatrixToQuaternion(torch.autograd.Function): adj_mat, ) ctx.b = b + wp.launch( kernel=compute_matrix_to_quat, dim=b, diff --git a/src/curobo/geom/types.py b/src/curobo/geom/types.py index 9622a43..08bd604 100644 --- a/src/curobo/geom/types.py +++ b/src/curobo/geom/types.py @@ -12,7 +12,7 @@ from __future__ import annotations # Standard Library from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence, Union # Third Party import numpy as np @@ -22,6 +22,7 @@ import trimesh # CuRobo from curobo.geom.sphere_fit import SphereFitType, fit_spheres_to_mesh from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation from curobo.types.math import Pose from curobo.util.logger import log_error, log_warn from curobo.util_file import get_assets_path, join_path @@ -60,7 +61,7 @@ class Obstacle: tensor_args: TensorDeviceType = TensorDeviceType() - def 
get_trimesh_mesh(self, process: bool = True) -> trimesh.Trimesh: + def get_trimesh_mesh(self, process: bool = True, process_color: bool = True) -> trimesh.Trimesh: """Create a trimesh instance from the obstacle representation. Args: @@ -74,8 +75,11 @@ class Obstacle: """ raise NotImplementedError - def save_as_mesh(self, file_path: str): + def save_as_mesh(self, file_path: str, transform_with_pose: bool = False): mesh_scene = self.get_trimesh_mesh() + if transform_with_pose: + mesh_scene.apply_transform(self.get_transform_matrix()) + mesh_scene.export(file_path) def get_cuboid(self) -> Cuboid: @@ -238,7 +242,7 @@ class Cuboid(Obstacle): if self.pose is None: log_error("Cuboid Obstacle requires Pose") - def get_trimesh_mesh(self, process: bool = True): + def get_trimesh_mesh(self, process: bool = True, process_color: bool = True): m = trimesh.creation.box(extents=self.dims) if self.color is not None: color_visual = trimesh.visual.color.ColorVisuals( @@ -254,7 +258,7 @@ class Capsule(Obstacle): base: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) tip: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) - def get_trimesh_mesh(self, process: bool = True): + def get_trimesh_mesh(self, process: bool = True, process_color: bool = True): height = self.tip[2] - self.base[2] if ( height < 0 @@ -280,7 +284,7 @@ class Cylinder(Obstacle): radius: float = 0.0 height: float = 0.0 - def get_trimesh_mesh(self, process: bool = True): + def get_trimesh_mesh(self, process: bool = True, process_color: bool = True): m = trimesh.creation.cylinder(radius=self.radius, height=self.height) if self.color is not None: color_visual = trimesh.visual.color.ColorVisuals( @@ -304,7 +308,7 @@ class Sphere(Obstacle): if self.pose is not None: self.position = self.pose[:3] - def get_trimesh_mesh(self, process: bool = True): + def get_trimesh_mesh(self, process: bool = True, process_color: bool = True): m = trimesh.creation.icosphere(radius=self.radius) if self.color is not None: color_visual = trimesh.visual.color.ColorVisuals( @@ -356,9 +360,9 @@ class Sphere(Obstacle): @dataclass class Mesh(Obstacle): file_path: Optional[str] = None - file_string: Optional[ - str - ] = None # storing full mesh as a string, loading from this is not implemented yet. + file_string: Optional[str] = ( + None # storing full mesh as a string, loading from this is not implemented yet. + ) urdf_path: Optional[str] = None # useful for visualization in isaac gym. 
vertices: Optional[List[List[float]]] = None
     faces: Optional[List[int]] = None
@@ -375,13 +379,13 @@ class Mesh(Obstacle):
             self.vertices = np.ravel(self.scale) * self.vertices
             self.scale = None
 
-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
         # load mesh from filepath or verts and faces:
         if self.file_path is not None:
             m = trimesh.load(self.file_path, process=process, force="mesh")
             if isinstance(m, trimesh.Scene):
                 m = m.dump(concatenate=True)
-            if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+            if process_color and isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
                 m.visual = m.visual.to_color()
         else:
             m = trimesh.Trimesh(
@@ -467,7 +471,7 @@ class BloxMap(Obstacle):
                 name=self.name + "_mesh", file_path=self.mesh_file_path, pose=self.pose
             )
 
-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
         if self.mesh is not None:
             return self.mesh.get_trimesh_mesh(process)
         else:
@@ -475,6 +479,91 @@ class BloxMap(Obstacle):
             return None
 
 
+@dataclass
+class PointCloud(Obstacle):
+    points: Union[torch.Tensor, np.ndarray, List[List[float]]] = None
+    points_features: Union[torch.Tensor, np.ndarray, List[List[float]], None] = None
+
+    def __post_init__(self):
+        if self.scale is not None and self.points is not None:
+            self.points = np.ravel(self.scale) * self.points
+            self.scale = None
+
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
+        points = self.points
+        if isinstance(points, torch.Tensor):
+            points = points.view(-1, 3).cpu().numpy()
+        if isinstance(points, list):
+            # np.asarray builds the array from the list's values; np.ndarray
+            # would misinterpret the list as a shape argument.
+            points = np.asarray(points)
+
+        mesh = Mesh.from_pointcloud(points, pose=self.pose)
+        return mesh.get_trimesh_mesh()
+
+    def get_mesh_data(self, process: bool = True):
+        verts = faces = None
+        m = self.get_trimesh_mesh(process=process)
+        verts = m.vertices.view(np.ndarray)
+        faces = m.faces
+        return verts, faces
+
+    @staticmethod
+    def from_camera_observation(
+        camera_obs: CameraObservation, name: str = "pc_obstacle", pose: Optional[List[float]] = None
+    ):
+        return PointCloud(name=name, pose=pose, points=camera_obs.get_pointcloud())
+
+    def get_bounding_spheres(
+        self,
+        n_spheres: int = 1,
+        surface_sphere_radius: float = 0.002,
+        fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        pre_transform_pose: Optional[Pose] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> List[Sphere]:
+        """Compute n spheres that fit in the volume of the object.
+
+        Args:
+            n_spheres: number of spheres to fit.
+
+        Returns:
+            List of fitted spheres.
+        """
+        # fit spheres to a mesh built from the pointcloud:
+        mesh = self.get_trimesh_mesh()
+        pts, n_radius = fit_spheres_to_mesh(
+            mesh, n_spheres, surface_sphere_radius, fit_type, voxelize_method=voxelize_method
+        )
+
+        obj_pose = Pose.from_list(self.pose, tensor_args)
+
+        # transform points:
+        if pre_transform_pose is not None:
+            obj_pose = pre_transform_pose.multiply(obj_pose)  # convert object pose to another frame
+
+        if pts is None or len(pts) == 0:
+            log_warn("spheres could not be fit, adding one sphere at origin")
+            pts = np.zeros((1, 3))
+            pts[0, :] = mesh.centroid
+            n_radius = [0.02]
+            obj_pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0], tensor_args)
+
+        points_cuda = tensor_args.to_device(pts)
+        pts = obj_pose.transform_points(points_cuda).cpu().view(-1, 3).numpy()
+
+        new_spheres = [
+            Sphere(
+                name="sph_" + str(i),
+                pose=[pts[i, 0], pts[i, 1], pts[i, 2], 1, 0, 0, 0],
+                radius=n_radius[i],
+            )
+            for i in range(pts.shape[0])
+        ]
+
+        return new_spheres
+
+
 @dataclass
 class WorldConfig(Sequence):
     """Representation of World for use in CuRobo."""
@@ -664,23 +753,25 @@ class WorldConfig(Sequence):
         )
 
     @staticmethod
-    def get_scene_graph(current_world: WorldConfig):
+    def get_scene_graph(current_world: WorldConfig, process_color: bool = True):
         m_world = WorldConfig.create_mesh_world(current_world)
-        mesh_scene = trimesh.scene.scene.Scene(base_frame="world")
+        mesh_scene = trimesh.scene.scene.Scene(base_frame="world_origin")
         mesh_list = m_world
         for m in mesh_list:
             mesh_scene.add_geometry(
-                m.get_trimesh_mesh(),
+                m.get_trimesh_mesh(process_color=process_color),
                 geom_name=m.name,
-                parent_node_name="world",
+                parent_node_name="world_origin",
                 transform=m.get_transform_matrix(),
            )
 
         return mesh_scene
 
     @staticmethod
-    def create_merged_mesh_world(current_world: WorldConfig, process: bool = True):
-        mesh_scene = WorldConfig.get_scene_graph(current_world)
+    def create_merged_mesh_world(
+        current_world: WorldConfig, process: bool = True, process_color: bool = True
+    ):
+        mesh_scene = WorldConfig.get_scene_graph(current_world, process_color=process_color)
         mesh_scene = mesh_scene.dump(concatenate=True)
         new_mesh = Mesh(
             vertices=mesh_scene.vertices.view(np.ndarray),
@@ -702,8 +793,10 @@ class WorldConfig(Sequence):
     def get_collision_check_world(self, mesh_process: bool = False):
         return WorldConfig.create_collision_support_world(self, process=mesh_process)
 
-    def save_world_as_mesh(self, file_path: str, save_as_scene_graph=False):
-        mesh_scene = WorldConfig.get_scene_graph(self)
+    def save_world_as_mesh(
+        self, file_path: str, save_as_scene_graph=False, process_color: bool = True
+    ):
+        mesh_scene = WorldConfig.get_scene_graph(self, process_color=process_color)
         if save_as_scene_graph:
             mesh_scene = mesh_scene.dump(concatenate=True)
 
diff --git a/src/curobo/graph/graph_nx.py b/src/curobo/graph/graph_nx.py
index 86ec22e..28cb3d1 100644
--- a/src/curobo/graph/graph_nx.py
+++ b/src/curobo/graph/graph_nx.py
@@ -10,10 +10,19 @@
 #
 
+# Standard Library
+import random
+
 # Third Party
 import networkx as nx
+import numpy as np
 import torch
 
+# This is needed to get deterministic results from networkx.
+# Note: it has to be set in global space.
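+# A minimal repeatability sketch (not part of this module): networkx draws
+# from the global `random` module when no seed argument is given, so seeding
+# once at import time makes graph construction reproducible:
+#
+#   random.seed(2); np.random.seed(2)
+#   g_a = nx.gnp_random_graph(20, 0.2)
+#   random.seed(2); np.random.seed(2)
+#   g_b = nx.gnp_random_graph(20, 0.2)
+#   assert sorted(g_a.edges()) == sorted(g_b.edges())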
+np.random.seed(2)
+random.seed(2)
+
 
 class NetworkxGraph(object):
     def __init__(self):
diff --git a/src/curobo/opt/__init__.py b/src/curobo/opt/__init__.py
index a08745f..7ec32c3 100644
--- a/src/curobo/opt/__init__.py
+++ b/src/curobo/opt/__init__.py
@@ -8,3 +8,15 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 #
+
+"""Optimization module containing several numerical solvers.
+
+The base class for an optimization solver is :class:`opt_base.Optimizer`. cuRobo provides two base
+classes that implement two popular ways to optimize: (1) particle-based optimization with
+:class:`particle.particle_opt_base.ParticleOptBase`, and (2) Newton/Quasi-Newton optimization with
+:class:`newton.newton_base.NewtonOptBase`. :class:`newton.newton_base.NewtonOptBase` contains
+implementations of several line search schemes. Note that these line search schemes are
+approximate, as cuRobo tries different line search magnitudes in parallel and chooses the largest
+that satisfies the line search conditions.
+
+"""
diff --git a/src/curobo/opt/newton/__init__.py b/src/curobo/opt/newton/__init__.py
index 3bc9b89..b1cf5d6 100644
--- a/src/curobo/opt/newton/__init__.py
+++ b/src/curobo/opt/newton/__init__.py
@@ -10,5 +10,5 @@
 #
 
 """
-This module contains code for cuda accelerated kinematics.
+This module contains Newton/Quasi-Newton solvers.
 """
diff --git a/src/curobo/opt/newton/lbfgs.py b/src/curobo/opt/newton/lbfgs.py
index f5f2c46..2017ee9 100644
--- a/src/curobo/opt/newton/lbfgs.py
+++ b/src/curobo/opt/newton/lbfgs.py
@@ -95,9 +95,9 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig):
             self.step_q_buffer[:] = 0.0
         return super().reset()
 
-    def update_nenvs(self, n_envs):
-        self.init_hessian(b=n_envs)
-        return super().update_nenvs(n_envs)
+    def update_nproblems(self, n_problems):
+        self.init_hessian(b=n_problems)
+        return super().update_nproblems(n_problems)
 
     def init_hessian(self, b=1):
         self.x_0 = torch.zeros(
diff --git a/src/curobo/opt/newton/newton_base.py b/src/curobo/opt/newton/newton_base.py
index e406d8f..62eaf21 100644
--- a/src/curobo/opt/newton/newton_base.py
+++ b/src/curobo/opt/newton/newton_base.py
@@ -79,7 +79,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
             self.outer_iters = math.ceil(self.n_iters / self.inner_iters)
 
         # create line search
-        self.update_nenvs(self.n_envs)
+        self.update_nproblems(self.n_problems)
 
         self.reset()
 
@@ -135,7 +135,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
         if self.store_debug:
             self.debug.append(q.view(-1, self.action_horizon, self.d_action).clone())
         with profiler.record_function("newton_base/init_opt"):
-            q = q.view(self.n_envs, self.action_horizon * self.d_action)
+            q = q.view(self.n_problems, self.action_horizon * self.d_action)
             grad_q = q.detach() * 0.0
         # run opt graph
         if not self.cu_opt_init:
@@ -150,7 +150,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
             if check_convergence(self.best_iteration, self.current_iteration, self.last_best):
                 break
 
-        best_q = best_q.view(self.n_envs, self.action_horizon, self.d_action)
+        best_q = best_q.view(self.n_problems, self.action_horizon, self.d_action)
         return best_q
 
     def reset(self):
@@ -171,9 +171,6 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
         if self.store_debug:
             self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())
             self.debug_cost.append(self.best_cost.detach().view(-1, 1).clone())
-            # self.debug.append(q.view(-1, self.action_horizon, self.d_action).clone())
-            # self.debug_cost.append(cost_n.detach().view(-1, 1).clone())
-        # print(grad_q)
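+        # return detached snapshots of (best solution, best cost, current
+        # iterate, current gradient); detaching keeps autograd from tracking
+        # the solver's internal iteration buffers.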
return self.best_q.detach(), self.best_cost.detach(), q.detach(), grad_q.detach() @@ -222,11 +219,11 @@ class NewtonOptBase(Optimizer, NewtonOptConfig): def _compute_cost_gradient(self, x): x_n = x.detach().requires_grad_(True) x_in = x_n.view( - self.n_envs * self.num_particles, self.action_horizon, self.rollout_fn.d_action + self.n_problems * self.num_particles, self.action_horizon, self.rollout_fn.d_action ) trajectories = self.rollout_fn(x_in) # x_n = (batch*line_search_scale) x horizon x d_action cost = torch.sum( - trajectories.costs.view(self.n_envs, self.num_particles, self.horizon), + trajectories.costs.view(self.n_problems, self.num_particles, self.horizon), dim=-1, keepdim=True, ) @@ -235,7 +232,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig): return ( cost, g_x, - ) # cost: [n_envs, n_particles, 1], g_x: [n_envs, n_particles, horizon*d_action] + ) # cost: [n_problems, n_particles, 1], g_x: [n_problems, n_particles, horizon*d_action] def _wolfe_line_search(self, x, step_direction): # x_set = get_x_set_jit(step_direction, x, self.alpha_list, self.action_lows, self.action_highs) @@ -455,36 +452,40 @@ class NewtonOptBase(Optimizer, NewtonOptConfig): mask_q = mask.unsqueeze(-1).expand(-1, self.d_opt) self.best_q.copy_(torch.where(mask_q, q, self.best_q)) - def update_nenvs(self, n_envs): + def update_nproblems(self, n_problems): self.l_vec = torch.ones( - (n_envs, self.num_particles, 1), + (n_problems, self.num_particles, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype, ) self.best_cost = ( - torch.ones((n_envs, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype) + torch.ones( + (n_problems, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) * 5000000.0 ) self.best_q = torch.zeros( - (n_envs, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype + (n_problems, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype ) self.best_grad_q = torch.zeros( - (n_envs, 1, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype + (n_problems, 1, self.d_opt), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, ) # create list: - self.alpha_list = self.line_scale.repeat(n_envs, 1, 1) + self.alpha_list = self.line_scale.repeat(n_problems, 1, 1) self.zero_alpha_list = self.alpha_list[:, :, 0:1].contiguous() h = self.alpha_list.shape[1] self.c_idx = torch.arange( - 0, n_envs * h, step=(h), device=self.tensor_args.device, dtype=torch.long + 0, n_problems * h, step=(h), device=self.tensor_args.device, dtype=torch.long ) self.best_iteration = torch.zeros( - (n_envs), device=self.tensor_args.device, dtype=torch.int16 + (n_problems), device=self.tensor_args.device, dtype=torch.int16 ) self.current_iteration = torch.zeros((1), device=self.tensor_args.device, dtype=torch.int16) self.cu_opt_init = False - super().update_nenvs(n_envs) + super().update_nproblems(n_problems) def _initialize_opt_iters_graph(self, q, grad_q, shift_steps): if self.use_cuda_graph: diff --git a/src/curobo/opt/opt_base.py b/src/curobo/opt/opt_base.py index e5fb7d3..f105685 100644 --- a/src/curobo/opt/opt_base.py +++ b/src/curobo/opt/opt_base.py @@ -8,6 +8,9 @@ # without an express license agreement from NVIDIA CORPORATION or # its affiliates is strictly prohibited. # +""" Base module for Optimization. 
""" +from __future__ import annotations + # Standard Library import time from abc import abstractmethod @@ -28,31 +31,75 @@ from curobo.util.torch_utils import is_cuda_graph_available @dataclass class OptimizerConfig: + """Configuration for an :meth:`Optimizer`.""" + + #: Number of optimization variables per timestep. d_action: int + + #: Lower bound for optimization variables. action_lows: List[float] + + #: Higher bound for optimization variables action_highs: List[float] - horizon: int - n_iters: int - rollout_fn: RolloutBase - tensor_args: TensorDeviceType - use_cuda_graph: bool - store_debug: bool - debug_info: Any - cold_start_n_iters: int - num_particles: Union[int, None] - n_envs: int - sync_cuda_time: bool - use_coo_sparse: bool + + #: action_horizon: int + #: Number of timesteps in optimization state, total variables = d_action * horizon + horizon: int + + #: Number of iterations to run optimization + n_iters: int + + #: Number of iterations to run optimization during the first instance. Setting to None will + #: use n_iters. This parameter is useful in MPC like settings where we need to run many + #: iterations during initialization (cold start) and then run only few iterations (warm start). + cold_start_n_iters: Union[int, None] + + #: Rollout function to use for computing cost, given optimization variables. + rollout_fn: RolloutBase + + #: Tensor device to use for optimization. + tensor_args: TensorDeviceType + + #: Capture optimization iteration in a cuda graph and replay graph instead of eager execution. + #: Enabling this can make optimization 10x faster. But changing control flow, tensor + #: shapes, or problem type is not allowed. + use_cuda_graph: bool + + #: Record debugging data such as optimization variables, and cost at every iteration. Enabling + #: this will disable cuda graph. + store_debug: bool + + #: Use this to record additional attributes from rollouts. + debug_info: Any + + #: Number of parallel problems to optimize. + n_problems: int + + #: Number of particles to use per problem. Common optimization solvers use many particles to + #: optimize a single problem. E.g., MPPI rolls out many parallel samples and computes a weighted + #: mean. In cuRobo, Quasi-Newton solvers use particles to run many line search magnitudes. + #: Total optimization batch size = n_problems * num_particles. + num_particles: Union[int, None] + + #: Synchronize device before computing solver time. + sync_cuda_time: bool + + #: Matmul with a Sparse tensor is used to create particles for each problem index to save memory + #: and compute. Some versions of pytorch do not support coo sparse, specifically during + #: torch profile runs. Set this to False to use a standard tensor. + use_coo_sparse: bool + def __post_init__(self): object.__setattr__(self, "action_highs", self.tensor_args.to_device(self.action_highs)) object.__setattr__(self, "action_lows", self.tensor_args.to_device(self.action_lows)) - # check cuda graph version: if self.use_cuda_graph: self.use_cuda_graph = is_cuda_graph_available() if self.num_particles is None: self.num_particles = 1 + if self.cold_start_n_iters is None: + self.cold_start_n_iters = self.n_iters @staticmethod def create_data_dict( @@ -61,6 +108,17 @@ class OptimizerConfig: tensor_args: TensorDeviceType = TensorDeviceType(), child_dict: Optional[Dict] = None, ): + """Helper function to create dictionary from optimizer parameters and rollout class. + + Args: + data_dict: optimizer parameters dictionary. + rollout_fn: rollout function. 
+ tensor_args: tensor cuda device. + child_dict: new dictionary where parameters will be stored. + + Returns: + Dictionary with parameters to create a :meth:`OptimizerConfig` + """ if child_dict is None: child_dict = deepcopy(data_dict) child_dict["d_action"] = rollout_fn.d_action @@ -78,17 +136,33 @@ class OptimizerConfig: class Optimizer(OptimizerConfig): def __init__(self, config: Optional[OptimizerConfig] = None) -> None: + """Base optimization solver class + + Args: + config: Initialized with parameters from a dataclass. + """ if config is not None: super().__init__(**vars(config)) self.opt_dt = 0.0 self.COLD_START = True - self.update_nenvs(self.n_envs) + self.update_nproblems(self.n_problems) self._batch_goal = None self._rollout_list = None self.debug = [] self.debug_cost = [] def optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor: + """Find a solution through optimization given the initial values for variables. + + Args: + opt_tensor: Initial value of optimization variables. + Shape: [n_problems, action_horizon, d_action] + shift_steps: Shift variables along action_horizon. Useful in mpc warm-start setting. + n_iters: Override number of iterations to run optimization. + + Returns: + Optimized values returned as a tensor of shape [n_problems, action_horizon, d_action]. + """ if self.COLD_START: n_iters = self.cold_start_n_iters self.COLD_START = False @@ -99,17 +173,12 @@ class Optimizer(OptimizerConfig): self.opt_dt = time.time() - st_time return out - @abstractmethod - def _optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor: - pass - - def _shift(self, shift_steps=0): - """ - Shift the variables in the solver to hotstart the next timestep - """ - return - def update_params(self, goal: Goal): + """Update parameters in the :meth:`curobo.rollout.rollout_base.RolloutBase` instance. + + Args: + goal: parameters to update rollout instance. + """ with profiler.record_function("OptBase/batch_goal"): if self._batch_goal is not None: self._batch_goal.copy_(goal, update_idx_buffers=True) # why True? @@ -118,80 +187,37 @@ class Optimizer(OptimizerConfig): self.rollout_fn.update_params(self._batch_goal) def reset(self): - """ - Reset the controller - """ + """Reset optimizer.""" self.rollout_fn.reset() self.debug = [] self.debug_cost = [] - # self.COLD_START = True - def update_nenvs(self, n_envs): - assert n_envs > 0 - self._update_env_kernel(n_envs, self.num_particles) - self.n_envs = n_envs + def update_nproblems(self, n_problems: int): + """Update the number of problems that need to be optimized. - def _update_env_kernel(self, n_envs, num_particles): - log_info( - "Updating env kernel [n_envs: " - + str(n_envs) - + " , num_particles: " - + str(num_particles) - + " ]" - ) + Args: + n_problems: number of problems. 
+ """ + assert n_problems > 0 + self._update_problem_kernel(n_problems, self.num_particles) + self.n_problems = n_problems - self.env_col = torch.arange( - 0, n_envs, step=1, dtype=torch.long, device=self.tensor_args.device - ) - self.n_select_ = torch.tensor( - [x * n_envs + x for x in range(n_envs)], - device=self.tensor_args.device, - dtype=torch.long, - ) - - # create sparse tensor: - sparse_indices = [] - for i in range(n_envs): - sparse_indices.extend([[i * num_particles + x, i] for x in range(num_particles)]) - - sparse_values = torch.ones(len(sparse_indices)) - sparse_indices = torch.tensor(sparse_indices) - if self.use_coo_sparse: - self.env_kernel_ = torch.sparse_coo_tensor( - sparse_indices.t(), - sparse_values, - device=self.tensor_args.device, - dtype=self.tensor_args.dtype, - ) - else: - self.env_kernel_ = torch.sparse_coo_tensor( - sparse_indices.t(), - sparse_values, - device="cpu", - dtype=self.tensor_args.dtype, - ) - self.env_kernel_ = self.env_kernel_.to_dense().to(device=self.tensor_args.device) - self._env_seeds = self.num_particles - - def get_nenv_tensor(self, x): - """This function takes an input tensor of shape (n_env,....) and converts it into - (n_particles,...) + def get_nproblem_tensor(self, x): + """This function takes an input tensor of shape (n_problem,....) and converts it into + (n_particles,...). """ - # if x.shape[0] != self.n_envs: - # x_env = x.unsqueeze(0).repeat(self.n_envs, 1) - # else: - # x_env = x - # create a tensor - nx_env = self.env_kernel_ @ x + nx_problem = self.problem_kernel_ @ x - return nx_env + return nx_problem def reset_seed(self): + """Reset seeds.""" return True def reset_cuda_graph(self): + """Reset CUDA Graphs. This does not work, workaround is to create a new instance.""" if self.use_cuda_graph: self.cu_opt_init = False else: @@ -199,7 +225,87 @@ class Optimizer(OptimizerConfig): self._batch_goal = None self.rollout_fn.reset_cuda_graph() + def reset_shape(self): + """Reset any flags in rollout class. Useful to reinitialize tensors for a new shape.""" + self.rollout_fn.reset_shape() + self._batch_goal = None + def get_all_rollout_instances(self) -> List[RolloutBase]: + """Get all instances of Rollout class in the optimizer.""" if self._rollout_list is None: self._rollout_list = [self.rollout_fn] return self._rollout_list + + @abstractmethod + def _optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor: + """Implement this function in a derived class containing the solver. + + Args: + opt_tensor: Initial value of optimization variables. + Shape: [n_problems, action_horizon, d_action] + shift_steps: Shift variables along action_horizon. Useful in mpc warm-start setting. + n_iters: Override number of iterations to run optimization. + + Returns: + Optimized variables in tensor shape [action_horizon, d_action]. + """ + return opt_tensor + + @abstractmethod + def _shift(self, shift_steps=0): + """Shift the variables in the solver to hotstart the next timestep. + + Args: + shift_steps: Number of timesteps to shift. + """ + return + + def _update_problem_kernel(self, n_problems: int, num_particles: int): + """Update matrix used to map problem index to number of particles. + + Args: + n_problems: Number of optimization problems. + num_particles: Number of particles per problem. 
+ """ + log_info( + "Updating problem kernel [n_problems: " + + str(n_problems) + + " , num_particles: " + + str(num_particles) + + " ]" + ) + + self.problem_col = torch.arange( + 0, n_problems, step=1, dtype=torch.long, device=self.tensor_args.device + ) + self.n_select_ = torch.tensor( + [x * n_problems + x for x in range(n_problems)], + device=self.tensor_args.device, + dtype=torch.long, + ) + + # create sparse tensor: + sparse_indices = [] + for i in range(n_problems): + sparse_indices.extend([[i * num_particles + x, i] for x in range(num_particles)]) + + sparse_values = torch.ones(len(sparse_indices)) + sparse_indices = torch.tensor(sparse_indices) + if self.use_coo_sparse: + self.problem_kernel_ = torch.sparse_coo_tensor( + sparse_indices.t(), + sparse_values, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + else: + self.problem_kernel_ = torch.sparse_coo_tensor( + sparse_indices.t(), + sparse_values, + device="cpu", + dtype=self.tensor_args.dtype, + ) + self.problem_kernel_ = self.problem_kernel_.to_dense().to( + device=self.tensor_args.device + ) + self._problem_seeds = self.num_particles diff --git a/src/curobo/opt/particle/parallel_mppi.py b/src/curobo/opt/particle/parallel_mppi.py index 109a834..7d728da 100644 --- a/src/curobo/opt/particle/parallel_mppi.py +++ b/src/curobo/opt/particle/parallel_mppi.py @@ -65,7 +65,7 @@ class ParallelMPPIConfig(ParticleOptConfig): alpha: float gamma: float kappa: float - sample_per_env: bool + sample_per_problem: bool def __post_init__(self): self.init_cov = self.tensor_args.to_device(self.init_cov).unsqueeze(0) @@ -264,7 +264,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): actions = trajectories.actions total_costs = self._compute_total_cost(costs) - # Let's reshape to n_envs now: + # Let's reshape to n_problems now: # first find the means before doing exponential utility: w = self._exp_util(total_costs) @@ -272,7 +272,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): # Update best action if self.sample_mode == SampleMode.BEST: best_idx = torch.argmax(w, dim=-1) - self.best_traj.copy_(actions[self.env_col, best_idx]) + self.best_traj.copy_(actions[self.problem_col, best_idx]) if self.store_rollouts and self.visual_traj is not None: vis_seq = getattr(trajectories.state, self.visual_traj) @@ -281,7 +281,9 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): self.top_idx = top_idx top_trajs = torch.index_select(vis_seq, 0, top_idx[0]) for i in range(1, top_idx.shape[0]): - trajs = torch.index_select(vis_seq, 0, top_idx[i] + (self.particles_per_env * i)) + trajs = torch.index_select( + vis_seq, 0, top_idx[i] + (self.particles_per_problem * i) + ) top_trajs = torch.cat((top_trajs, trajs), dim=0) if self.top_trajs is None or top_trajs.shape != self.top_trajs: self.top_trajs = top_trajs @@ -317,13 +319,15 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): act_seq = self.mean_action.unsqueeze(-3) + scaled_delta cat_list = [act_seq] - if self.neg_per_env > 0: + if self.neg_per_problem > 0: neg_action = -1.0 * self.mean_action - neg_act_seqs = neg_action.unsqueeze(-3).expand(-1, self.neg_per_env, -1, -1) + neg_act_seqs = neg_action.unsqueeze(-3).expand(-1, self.neg_per_problem, -1, -1) cat_list.append(neg_act_seqs) - if self.null_per_env > 0: + if self.null_per_problem > 0: cat_list.append( - self.null_act_seqs[: self.null_per_env].unsqueeze(0).expand(self.n_envs, -1, -1, -1) + self.null_act_seqs[: self.null_per_problem] + .unsqueeze(0) + .expand(self.n_problems, -1, -1, -1) ) act_seq = 
torch.cat( @@ -343,8 +347,8 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): def update_init_mean(self, init_mean): # update mean: # init_mean = init_mean.clone() - if init_mean.shape[0] != self.n_envs: - init_mean = init_mean.expand(self.n_envs, -1, -1) + if init_mean.shape[0] != self.n_problems: + init_mean = init_mean.expand(self.n_problems, -1, -1) if not copy_tensor(init_mean, self.mean_action): self.mean_action = init_mean.clone() if not copy_tensor(init_mean, self.best_traj): @@ -353,27 +357,27 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): def reset_mean(self): with profiler.record_function("mppi/reset_mean"): if self.random_mean: - mean = self.mean_lib.get_samples([self.n_envs]) + mean = self.mean_lib.get_samples([self.n_problems]) self.update_init_mean(mean) else: self.update_init_mean(self.init_mean) def reset_covariance(self): with profiler.record_function("mppi/reset_cov"): - # init_cov can either be a single value, or n_envs x 1 or n_envs x d_action + # init_cov can either be a single value, or n_problems x 1 or n_problems x d_action if self.cov_type == CovType.SIGMA_I: - # init_cov can either be a single value, or n_envs x 1 + # init_cov can either be a single value, or n_problems x 1 self.cov_action = self.init_cov - if self.init_cov.shape[0] != self.n_envs: - self.cov_action = self.init_cov.unsqueeze(0).expand(self.n_envs, -1) + if self.init_cov.shape[0] != self.n_problems: + self.cov_action = self.init_cov.unsqueeze(0).expand(self.n_problems, -1) self.inv_cov_action = 1.0 / self.cov_action a = torch.sqrt(self.cov_action) if not copy_tensor(a, self.scale_tril): self.scale_tril = a elif self.cov_type == CovType.DIAG_A: - # init_cov can either be a single value, or n_envs x 1 or n_envs x 7 + # init_cov can either be a single value, or n_problems x 1 or n_problems x 7 init_cov = self.init_cov.clone() # if(init_cov.shape[-1] != self.d_action): @@ -382,8 +386,8 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): if len(init_cov.shape) == 2 and init_cov.shape[-1] != self.d_action: init_cov = init_cov.expand(-1, self.d_action) init_cov = init_cov.unsqueeze(1) - if init_cov.shape[0] != self.n_envs: - init_cov = init_cov.expand(self.n_envs, -1, -1) + if init_cov.shape[0] != self.n_problems: + init_cov = init_cov.expand(self.n_problems, -1, -1) if not copy_tensor(init_cov.clone(), self.cov_action): self.cov_action = init_cov.clone() self.inv_cov_action = 1.0 / self.cov_action @@ -523,16 +527,18 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): n_iters = 1 else: n_iters = self.n_iters - if self.sample_per_env: + if self.sample_per_problem: s_set = ( self.sample_lib.get_samples( - sample_shape=[self.sampled_particles_per_env * self.n_envs * n_iters], + sample_shape=[ + self.sampled_particles_per_problem * self.n_problems * n_iters + ], base_seed=self.seed, ) .view( n_iters, - self.n_envs, - self.sampled_particles_per_env, + self.n_problems, + self.sampled_particles_per_problem, self.action_horizon, self.d_action, ) @@ -540,13 +546,17 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): ) else: s_set = self.sample_lib.get_samples( - sample_shape=[n_iters * (self.sampled_particles_per_env)], + sample_shape=[n_iters * (self.sampled_particles_per_problem)], base_seed=self.seed, ) s_set = s_set.view( - n_iters, 1, self.sampled_particles_per_env, self.action_horizon, self.d_action + n_iters, + 1, + self.sampled_particles_per_problem, + self.action_horizon, + self.d_action, ) - s_set = s_set.repeat(1, self.n_envs, 1, 1, 1).clone() + s_set = 
s_set.repeat(1, self.n_problems, 1, 1, 1).clone() s_set[:, :, -1, :, :] = 0.0 if not copy_tensor(s_set, self._sample_set): log_info("ParallelMPPI: Updating sample set") @@ -575,7 +585,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): Parameters ---------- state : dict or np.ndarray - Initial state to set the simulation env to + Initial state to set the simulation problem to """ return super().generate_rollouts(init_act) diff --git a/src/curobo/opt/particle/particle_opt_base.py b/src/curobo/opt/particle/particle_opt_base.py index 5f66137..ffb849f 100644 --- a/src/curobo/opt/particle/particle_opt_base.py +++ b/src/curobo/opt/particle/particle_opt_base.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # # Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # @@ -95,7 +94,7 @@ class ParticleOptBase(Optimizer, ParticleOptConfig): self.trajectories = None self.cu_opt_init = False self.info = ParticleOptInfo() - self.update_num_particles_per_env(self.num_particles) + self.update_num_particles_per_problem(self.num_particles) @abstractmethod def _get_action_seq(self, mode=SampleMode): @@ -172,7 +171,7 @@ class ParticleOptBase(Optimizer, ParticleOptConfig): Parameters ---------- state : dict or np.ndarray - Initial state to set the simulation env to + Initial state to set the simulation problem to """ act_seq = self.sample_actions(init_act) @@ -259,10 +258,10 @@ class ParticleOptBase(Optimizer, ParticleOptConfig): # generate random simulated trajectories trajectory = self.generate_rollouts() trajectory.actions = trajectory.actions.view( - self.n_envs, self.particles_per_env, self.action_horizon, self.d_action + self.n_problems, self.particles_per_problem, self.action_horizon, self.d_action ) trajectory.costs = trajectory.costs.view( - self.n_envs, self.particles_per_env, self.horizon + self.n_problems, self.particles_per_problem, self.horizon ) with profiler.record_function("mpc/mppi/update_distribution"): self._update_distribution(trajectory) @@ -275,26 +274,26 @@ class ParticleOptBase(Optimizer, ParticleOptConfig): curr_action_seq = self._get_action_seq(mode=self.sample_mode) return curr_action_seq - def update_nenvs(self, n_envs): - assert n_envs > 0 - self.total_num_particles = n_envs * self.num_particles + def update_nproblems(self, n_problems): + assert n_problems > 0 + self.total_num_particles = n_problems * self.num_particles self.cu_opt_init = False - super().update_nenvs(n_envs) + super().update_nproblems(n_problems) - def update_num_particles_per_env(self, num_particles_per_env): - self.null_per_env = round(int(self.null_act_frac * num_particles_per_env * 0.5)) + def update_num_particles_per_problem(self, num_particles_per_problem): + self.null_per_problem = round(int(self.null_act_frac * num_particles_per_problem * 0.5)) - self.neg_per_env = ( - round(int(self.null_act_frac * num_particles_per_env)) - self.null_per_env + self.neg_per_problem = ( + round(int(self.null_act_frac * num_particles_per_problem)) - self.null_per_problem ) - self.sampled_particles_per_env = ( - num_particles_per_env - self.null_per_env - self.neg_per_env + self.sampled_particles_per_problem = ( + num_particles_per_problem - self.null_per_problem - self.neg_per_problem ) - self.particles_per_env = num_particles_per_env - if self.null_per_env > 0: + self.particles_per_problem = num_particles_per_problem + if self.null_per_problem > 0: self.null_act_seqs = torch.zeros( - self.null_per_env, + self.null_per_problem, self.action_horizon, self.d_action, device=self.tensor_args.device, diff --git 
a/src/curobo/opt/particle/particle_opt_utils.py b/src/curobo/opt/particle/particle_opt_utils.py index ff72716..d915e81 100644 --- a/src/curobo/opt/particle/particle_opt_utils.py +++ b/src/curobo/opt/particle/particle_opt_utils.py @@ -121,7 +121,7 @@ def get_stomp_cov_jit( else: A[k * horizon + i, k * horizon + index] = fd_array[j + 3] - R = torch.matmul(A.transpose(-2, -1), A) + R = torch.matmul(A.transpose(-2, -1).clone(), A.clone()) M = torch.inverse(R) scaled_M = (1 / horizon) * M / (torch.max(torch.abs(M), dim=1)[0].unsqueeze(0)) cov = M / torch.max(torch.abs(M)) @@ -132,7 +132,6 @@ def get_stomp_cov_jit( scale_tril = torch.linalg.cholesky(cov) else: scale_tril = cov - """ k = 0 act_cov_matrix = cov[k * horizon:k * horizon + horizon, k * horizon:k * horizon + horizon] diff --git a/src/curobo/rollout/arm_base.py b/src/curobo/rollout/arm_base.py index 43d6b23..603754b 100644 --- a/src/curobo/rollout/arm_base.py +++ b/src/curobo/rollout/arm_base.py @@ -39,7 +39,7 @@ from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutConfig, Rollou from curobo.types.base import TensorDeviceType from curobo.types.robot import CSpaceConfig, RobotConfig from curobo.types.state import JointState -from curobo.util.logger import log_info, log_warn +from curobo.util.logger import log_error, log_info, log_warn from curobo.util.tensor_util import cat_sum @@ -366,6 +366,7 @@ class ArmBase(RolloutBase, ArmBaseConfig): ) cost_list.append(coll_cost) if return_list: + return cost_list cost = cat_sum(cost_list) return cost @@ -424,6 +425,7 @@ class ArmBase(RolloutBase, ArmBaseConfig): out_metrics = self.constraint_fn(state) out_metrics.state = state out_metrics = self.convergence_fn(state, out_metrics) + out_metrics.cost = self.cost_fn(state) return out_metrics def get_metrics_cuda_graph(self, state: JointState): @@ -451,6 +453,8 @@ class ArmBase(RolloutBase, ArmBaseConfig): with torch.cuda.graph(self.cu_metrics_graph, stream=s): self._cu_out_metrics = self.get_metrics(self._cu_metrics_state_in) self._metrics_cuda_graph_init = True + if self._cu_metrics_state_in.position.shape != state.position.shape: + log_error("cuda graph changed") self._cu_metrics_state_in.copy_(state) self.cu_metrics_graph.replay() out_metrics = self._cu_out_metrics @@ -462,17 +466,6 @@ class ArmBase(RolloutBase, ArmBaseConfig): ): if out_metrics is None: out_metrics = RolloutMetrics() - if ( - self.convergence_cfg.null_space_cfg is not None - and self.null_convergence.enabled - and self._goal_buffer.batch_retract_state_idx is not None - ): - out_metrics.cost = self.null_convergence.forward_target_idx( - self._goal_buffer.retract_state, - state.state_seq.position, - self._goal_buffer.batch_retract_state_idx, - ) - return out_metrics def _get_augmented_state(self, state: JointState) -> KinematicModelState: @@ -688,9 +681,11 @@ class ArmBase(RolloutBase, ArmBaseConfig): act_seq = self.dynamics_model.init_action_mean.unsqueeze(0).repeat(self.batch_size, 1, 1) return act_seq - def reset_cuda_graph(self): + def reset_shape(self): self._goal_idx_update = True + super().reset_shape() + def reset_cuda_graph(self): super().reset_cuda_graph() def get_action_from_state(self, state: JointState): diff --git a/src/curobo/rollout/arm_reacher.py b/src/curobo/rollout/arm_reacher.py index 2b0a96b..8074d11 100644 --- a/src/curobo/rollout/arm_reacher.py +++ b/src/curobo/rollout/arm_reacher.py @@ -20,16 +20,16 @@ import torch.autograd.profiler as profiler from curobo.geom.sdf.world import WorldCollision from curobo.rollout.cost.cost_base import CostConfig 
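# Editor's note (illustrative sketch, not part of this commit): the
# log_error("cuda graph changed") guard added to get_metrics_cuda_graph above
# exists because a captured CUDA graph replays fixed kernels on fixed buffers,
# so an input whose shape differs from the capture-time buffer cannot be copied
# in and would silently corrupt replay. A minimal standalone sketch of the same
# pattern, with toy shapes and names (assumes a CUDA device; production code
# would also run a warmup pass before capture):
import torch

static_in = torch.zeros(8, 7, device="cuda")  # input buffer used at capture time
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_out = static_in * 2.0  # toy computation recorded into the graph

def run(x: torch.Tensor) -> torch.Tensor:
    if x.shape != static_in.shape:  # same check as the guard above
        raise RuntimeError("cuda graph changed")
    static_in.copy_(x)  # refresh the capture-time buffer in place
    graph.replay()      # re-execute the recorded kernels on the stored buffers
    return static_out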
from curobo.rollout.cost.dist_cost import DistCost, DistCostConfig -from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig +from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig, PoseCostMetric from curobo.rollout.cost.straight_line_cost import StraightLineCost from curobo.rollout.cost.zero_cost import ZeroCost from curobo.rollout.dynamics_model.kinematic_model import KinematicModelState from curobo.rollout.rollout_base import Goal, RolloutMetrics from curobo.types.base import TensorDeviceType from curobo.types.robot import RobotConfig -from curobo.types.tensor import T_BValue_float +from curobo.types.tensor import T_BValue_float, T_BValue_int from curobo.util.helpers import list_idx_if_not_none -from curobo.util.logger import log_info +from curobo.util.logger import log_error, log_info from curobo.util.tensor_util import cat_max, cat_sum # Local Folder @@ -42,6 +42,8 @@ class ArmReacherMetrics(RolloutMetrics): position_error: Optional[T_BValue_float] = None rotation_error: Optional[T_BValue_float] = None pose_error: Optional[T_BValue_float] = None + goalset_index: Optional[T_BValue_int] = None + null_space_error: Optional[T_BValue_float] = None def __getitem__(self, idx): d_list = [ @@ -53,6 +55,8 @@ class ArmReacherMetrics(RolloutMetrics): self.position_error, self.rotation_error, self.pose_error, + self.goalset_index, + self.null_space_error, ] idx_vals = list_idx_if_not_none(d_list, idx) return ArmReacherMetrics(*idx_vals) @@ -65,10 +69,14 @@ class ArmReacherMetrics(RolloutMetrics): constraint=None if self.constraint is None else self.constraint.clone(), feasible=None if self.feasible is None else self.feasible.clone(), state=None if self.state is None else self.state, - cspace_error=None if self.cspace_error is None else self.cspace_error, - position_error=None if self.position_error is None else self.position_error, - rotation_error=None if self.rotation_error is None else self.rotation_error, - pose_error=None if self.pose_error is None else self.pose_error, + cspace_error=None if self.cspace_error is None else self.cspace_error.clone(), + position_error=None if self.position_error is None else self.position_error.clone(), + rotation_error=None if self.rotation_error is None else self.rotation_error.clone(), + pose_error=None if self.pose_error is None else self.pose_error.clone(), + goalset_index=None if self.goalset_index is None else self.goalset_index.clone(), + null_space_error=( + None if self.null_space_error is None else self.null_space_error.clone() + ), ) @@ -254,6 +262,7 @@ class ArmReacher(ArmBase, ArmReacherConfig): goal_cost = self.goal_cost.forward( ee_pos_batch, ee_quat_batch, self._goal_buffer ) + cost_list.append(goal_cost) with profiler.record_function("cost/link_poses"): if self._goal_buffer.links_goal_pose is not None and self.cost_cfg.pose_cfg is not None: @@ -296,36 +305,21 @@ class ArmReacher(ArmBase, ArmReacherConfig): g_dist, ) - # cost += z_acc cost_list.append(z_acc) - # print(self.cost_cfg.zero_jerk_cfg) - if ( - self.cost_cfg.zero_jerk_cfg is not None - and self.zero_jerk_cost.enabled - # and g_dist is not None - ): - # jerk = self.dynamics_model._aux_matrix @ state_batch.acceleration + if self.cost_cfg.zero_jerk_cfg is not None and self.zero_jerk_cost.enabled: z_jerk = self.zero_jerk_cost.forward( state_batch.jerk, g_dist, ) cost_list.append(z_jerk) - # cost += z_jerk - if ( - self.cost_cfg.zero_vel_cfg is not None - and self.zero_vel_cost.enabled - # and g_dist is not None - ): + if self.cost_cfg.zero_vel_cfg is not None and 
self.zero_vel_cost.enabled: z_vel = self.zero_vel_cost.forward( state_batch.velocity, g_dist, ) - # cost += z_vel - # print(z_vel.shape) cost_list.append(z_vel) cost = cat_sum(cost_list) - # print(cost[:].T) return cost def convergence_fn( @@ -350,6 +344,7 @@ class ArmReacher(ArmBase, ArmReacherConfig): ) = self.pose_convergence.forward_out_distance( state.ee_pos_seq, state.ee_quat_seq, self._goal_buffer ) + out_metrics.goalset_index = self.pose_convergence.goalset_index_buffer # .clone() if ( self._goal_buffer.links_goal_pose is not None and self.convergence_cfg.pose_cfg is not None @@ -389,6 +384,17 @@ class ArmReacher(ArmBase, ArmReacherConfig): True, ) + if ( + self.convergence_cfg.null_space_cfg is not None + and self.null_convergence.enabled + and self._goal_buffer.batch_retract_state_idx is not None + ): + out_metrics.null_space_error = self.null_convergence.forward_target_idx( + self._goal_buffer.retract_state, + state.state_seq.position, + self._goal_buffer.batch_retract_state_idx, + ) + return out_metrics def update_params( @@ -420,3 +426,43 @@ class ArmReacher(ArmBase, ArmReacherConfig): else: self.dist_cost.disable_cost() self.cspace_convergence.disable_cost() + + def get_pose_costs(self, include_link_pose: bool = False, include_convergence: bool = True): + pose_costs = [self.goal_cost] + if include_convergence: + pose_costs += [self.pose_convergence] + if include_link_pose: + log_error("Not implemented yet") + return pose_costs + + def update_pose_cost_metric( + self, + metric: PoseCostMetric, + ): + pose_costs = self.get_pose_costs() + if metric.hold_partial_pose: + if metric.hold_vec_weight is None: + log_error("hold_vec_weight is required") + [x.hold_partial_pose(metric.hold_vec_weight) for x in pose_costs] + if metric.release_partial_pose: + [x.release_partial_pose() for x in pose_costs] + if metric.reach_partial_pose: + if metric.reach_vec_weight is None: + log_error("reach_vec_weight is required") + [x.reach_partial_pose(metric.reach_vec_weight) for x in pose_costs] + if metric.reach_full_pose: + [x.reach_full_pose() for x in pose_costs] + + pose_costs = self.get_pose_costs(include_convergence=False) + if metric.remove_offset_waypoint: + [x.remove_offset_waypoint() for x in pose_costs] + + if metric.offset_position is not None or metric.offset_rotation is not None: + [ + x.update_offset_waypoint( + offset_position=metric.offset_position, + offset_rotation=metric.offset_rotation, + offset_tstep_fraction=metric.offset_tstep_fraction, + ) + for x in pose_costs + ] diff --git a/src/curobo/rollout/cost/bound_cost.py b/src/curobo/rollout/cost/bound_cost.py index 7d18efc..ebaf8f5 100644 --- a/src/curobo/rollout/cost/bound_cost.py +++ b/src/curobo/rollout/cost/bound_cost.py @@ -257,13 +257,13 @@ class BoundCost(CostBase, BoundCostConfig): return cost def update_dt(self, dt: Union[float, torch.Tensor]): - # return super().update_dt(dt) if self.cost_type == BoundCostType.BOUNDS_SMOOTH: v_scale = dt / self._dt a_scale = v_scale**2 j_scale = v_scale**3 self.smooth_weight[1] *= a_scale self.smooth_weight[2] *= j_scale + return super().update_dt(dt) diff --git a/src/curobo/rollout/cost/pose_cost.py b/src/curobo/rollout/cost/pose_cost.py index ee5ef65..7e1a6e0 100644 --- a/src/curobo/rollout/cost/pose_cost.py +++ b/src/curobo/rollout/cost/pose_cost.py @@ -8,19 +8,23 @@ # without an express license agreement from NVIDIA CORPORATION or # its affiliates is strictly prohibited. 
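# Editor's note (illustrative arithmetic, not part of this commit): the
# BoundCost.update_dt hunk above rescales the smoothness weights when the
# trajectory dt changes; the acceleration and jerk weights are multiplied by
# the square and cube of v_scale = dt / dt_prev so the relative weighting of
# the BOUNDS_SMOOTH terms is preserved. Values below are made up:
dt_prev, dt_new = 0.05, 0.025
v_scale = dt_new / dt_prev       # 0.5
a_scale = v_scale**2             # 0.25, multiplies the acceleration weight
j_scale = v_scale**3             # 0.125, multiplies the jerk weight
smooth_weight = [1.0, 2.0, 0.5]  # [velocity, acceleration, jerk] (illustrative)
smooth_weight[1] *= a_scale
smooth_weight[2] *= j_scale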
# +from __future__ import annotations + # Standard Library +import math from dataclasses import dataclass from enum import Enum from typing import List, Optional # Third Party import torch -from torch.autograd import Function # CuRobo -from curobo.curobolib.geom import get_pose_distance, get_pose_distance_backward +from curobo.curobolib.geom import PoseError, PoseErrorDistance from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType from curobo.types.math import OrientationError, Pose +from curobo.util.logger import log_error # Local Folder from .cost_base import CostBase, CostConfig @@ -37,7 +41,11 @@ class PoseErrorType(Enum): class PoseCostConfig(CostConfig): cost_type: PoseErrorType = PoseErrorType.BATCH_GOAL use_metric: bool = False + project_distance: bool = True run_vec_weight: Optional[List[float]] = None + use_projected_distance: bool = True + offset_waypoint: List[float] = None + offset_tstep_fraction: float = -1.0 def __post_init__(self): if self.run_vec_weight is not None: @@ -54,392 +62,85 @@ class PoseCostConfig(CostConfig): self.vec_convergence = torch.zeros( 2, device=self.tensor_args.device, dtype=self.tensor_args.dtype ) + if self.offset_waypoint is None: + self.offset_waypoint = [0, 0, 0, 0, 0, 0] + if self.run_weight is None: + self.run_weight = 1 + self.offset_waypoint = self.tensor_args.to_device(self.offset_waypoint) + if isinstance(self.offset_tstep_fraction, float): + self.offset_tstep_fraction = self.tensor_args.to_device([self.offset_tstep_fraction]) return super().__post_init__() -@torch.jit.script -def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec): - grad_vec = grad_g_dist + (grad_out_distance * weight) - grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec - return grad +@dataclass +class PoseCostMetric: + hold_partial_pose: bool = False + release_partial_pose: bool = False + hold_vec_weight: Optional[torch.Tensor] = None + reach_partial_pose: bool = False + reach_full_pose: bool = False + reach_vec_weight: Optional[torch.Tensor] = None + offset_position: Optional[torch.Tensor] = None + offset_rotation: Optional[torch.Tensor] = None + offset_tstep_fraction: float = -1.0 + remove_offset_waypoint: bool = False + def clone(self): -# full method: -@torch.jit.script -def backward_full_PoseError_jit( - grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q -): - p_grad = (grad_g_dist + (grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p - q_grad = (grad_r_err + (grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q - # p_grad = ((grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p - # q_grad = ((grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q - - return p_grad, q_grad - - -class PoseErrorDistance(Function): - @staticmethod - def forward( - ctx, - current_position, - goal_position, - current_quat, - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - out_p_grad, - out_q_grad, - batch_size, - horizon, - mode=PoseErrorType.BATCH_GOAL.value, - num_goals=1, - use_metric=False, - ): - # out_distance = current_position[..., 0].detach().clone() * 0.0 - # out_position_distance = out_distance.detach().clone() - # out_rotation_distance = out_distance.detach().clone() - # out_vec = ( - # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) - # * 0.0 - # ) - # out_idx = out_distance.clone().to(dtype=torch.long) - - ( - 
out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - ) = get_pose_distance( - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - current_position.contiguous(), - goal_position, - current_quat.contiguous(), - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - batch_size, - horizon, - mode, - num_goals, - current_position.requires_grad, - True, - use_metric, - ) - ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad) - return out_distance, out_position_distance, out_rotation_distance, out_idx # .view(-1,1) - - @staticmethod - def backward(ctx, grad_out_distance, grad_g_dist, grad_r_err, grad_out_idx): - (g_vec_p, g_vec_q, weight, out_grad_p, out_grad_q) = ctx.saved_tensors - pos_grad = None - quat_grad = None - batch_size = g_vec_p.shape[0] * g_vec_p.shape[1] - if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: - pos_grad, quat_grad = get_pose_distance_backward( - out_grad_p, - out_grad_q, - grad_out_distance.contiguous(), - grad_g_dist.contiguous(), - grad_r_err.contiguous(), - weight, - g_vec_p, - g_vec_q, - batch_size, - use_distance=True, - ) - # pos_grad, quat_grad = backward_full_PoseError_jit( - # grad_out_distance, - # grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q - # ) - elif ctx.needs_input_grad[0]: - pos_grad = backward_PoseError_jit(grad_g_dist, grad_out_distance, p_w, g_vec_p) - # grad_vec = grad_g_dist + (grad_out_distance * weight[1]) - # pos_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., 4:] - elif ctx.needs_input_grad[2]: - quat_grad = backward_PoseError_jit(grad_r_err, grad_out_distance, q_w, g_vec_q) - # grad_vec = grad_r_err + (grad_out_distance * weight[0]) - # quat_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., :4] - return ( - pos_grad, - None, - quat_grad, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, + return PoseCostMetric( + hold_partial_pose=self.hold_partial_pose, + release_partial_pose=self.release_partial_pose, + hold_vec_weight=None if self.hold_vec_weight is None else self.hold_vec_weight.clone(), + reach_partial_pose=self.reach_partial_pose, + reach_full_pose=self.reach_full_pose, + reach_vec_weight=( + None if self.reach_vec_weight is None else self.reach_vec_weight.clone() + ), + offset_position=None if self.offset_position is None else self.offset_position.clone(), + offset_rotation=None if self.offset_rotation is None else self.offset_rotation.clone(), + offset_tstep_fraction=self.offset_tstep_fraction, + remove_offset_waypoint=self.remove_offset_waypoint, ) + @classmethod + def create_grasp_approach_metric( + cls, + offset_position: float = 0.5, + linear_axis: int = 2, + tstep_fraction: float = 0.6, + tensor_args: TensorDeviceType = TensorDeviceType(), + ) -> PoseCostMetric: + """Enables moving to a pregrasp and then locked orientation movement to final grasp. 
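# Editor's note (usage sketch, not part of this commit; the deleted autograd
# Functions resume below): how the factory documented above might be called,
# with illustrative values. hold_vec_weight is ordered [rx, ry, rz, x, y, z],
# so zeroing index 3 + linear_axis frees translation along that axis while the
# other five degrees of freedom are held:
from curobo.rollout.cost.pose_cost import PoseCostMetric

metric = PoseCostMetric.create_grasp_approach_metric(
    offset_position=0.1,  # pregrasp offset in meters (illustrative)
    linear_axis=2,        # approach along the z axis
    tstep_fraction=0.8,   # constrain the final 20% of trajectory timesteps
)
# The metric is consumed by ArmReacher.update_pose_cost_metric (added later in
# this diff), which forwards it to the pose cost and pose convergence terms.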
-class PoseLoss(Function): - @staticmethod - def forward( - ctx, - current_position, - goal_position, - current_quat, - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - out_p_grad, - out_q_grad, - batch_size, - horizon, - mode=PoseErrorType.BATCH_GOAL.value, - num_goals=1, - use_metric=False, - ): - # out_distance = current_position[..., 0].detach().clone() * 0.0 - # out_position_distance = out_distance.detach().clone() - # out_rotation_distance = out_distance.detach().clone() - # out_vec = ( - # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) - # * 0.0 - # ) - # out_idx = out_distance.clone().to(dtype=torch.long) + Since this is added as a cost, the trajectory will not reach the exact offset, instead it + will try to take a blended path to the final grasp without stopping at the offset. - ( - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - ) = get_pose_distance( - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - current_position.contiguous(), - goal_position, - current_quat.contiguous(), - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - batch_size, - horizon, - mode, - num_goals, - current_position.requires_grad, - False, - use_metric, - ) - ctx.save_for_backward(out_p_vec, out_r_vec) - return out_distance + Args: + offset_position: offset in meters. + linear_axis: specifies the x y or z axis. + tstep_fraction: specifies the timestep fraction to start activating this constraint. + tensor_args: cuda device. - @staticmethod - def backward(ctx, grad_out_distance): # , grad_g_dist, grad_r_err, grad_out_idx): - pos_grad = None - quat_grad = None - if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - pos_grad = g_vec_p * grad_out_distance.unsqueeze(1) - quat_grad = g_vec_q * grad_out_distance.unsqueeze(1) - pos_grad = pos_grad.unsqueeze(-2) - quat_grad = quat_grad.unsqueeze(-2) - elif ctx.needs_input_grad[0]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - - pos_grad = g_vec_p * grad_out_distance.unsqueeze(1) - pos_grad = pos_grad.unsqueeze(-2) - elif ctx.needs_input_grad[2]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - - quat_grad = g_vec_q * grad_out_distance.unsqueeze(1) - quat_grad = quat_grad.unsqueeze(-2) - - return ( - pos_grad, - None, - quat_grad, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, + Returns: + cost metric. 
+ """ + hold_vec_weight = tensor_args.to_device([1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) + hold_vec_weight[3 + linear_axis] = 0.0 + offset_position_vec = tensor_args.to_device([0.0, 0.0, 0.0]) + offset_position_vec[linear_axis] = offset_position + return cls( + hold_partial_pose=True, + hold_vec_weight=hold_vec_weight, + offset_position=offset_position_vec, + offset_tstep_fraction=tstep_fraction, ) - -class PoseError(Function): - @staticmethod - def forward( - ctx, - current_position, - goal_position, - current_quat, - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - out_p_grad, - out_q_grad, - batch_size, - horizon, - mode=PoseErrorType.BATCH_GOAL.value, - num_goals=1, - use_metric=False, - ): - # out_distance = current_position[..., 0].detach().clone() * 0.0 - # out_position_distance = out_distance.detach().clone() - # out_rotation_distance = out_distance.detach().clone() - # out_vec = ( - # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) - # * 0.0 - # ) - # out_idx = out_distance.clone().to(dtype=torch.long) - - ( - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - ) = get_pose_distance( - out_distance, - out_position_distance, - out_rotation_distance, - out_p_vec, - out_r_vec, - out_idx, - current_position.contiguous(), - goal_position, - current_quat.contiguous(), - goal_quat, - vec_weight, - weight, - vec_convergence, - run_weight, - run_vec_weight, - batch_pose_idx, - batch_size, - horizon, - mode, - num_goals, - current_position.requires_grad, - False, - use_metric, - ) - ctx.save_for_backward(out_p_vec, out_r_vec) - return out_distance - - @staticmethod - def backward(ctx, grad_out_distance): # , grad_g_dist, grad_r_err, grad_out_idx): - pos_grad = None - quat_grad = None - if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - - pos_grad = g_vec_p - quat_grad = g_vec_q - elif ctx.needs_input_grad[0]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - - pos_grad = g_vec_p - elif ctx.needs_input_grad[2]: - (g_vec_p, g_vec_q) = ctx.saved_tensors - - quat_grad = g_vec_q - return ( - pos_grad, - None, - quat_grad, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, + @classmethod + def reset_metric(cls) -> PoseCostMetric: + return PoseCostMetric( + remove_offset_waypoint=True, + reach_full_pose=True, + release_partial_pose=True, ) @@ -449,13 +150,88 @@ class PoseCost(CostBase, PoseCostConfig): CostBase.__init__(self) self.rot_weight = self.vec_weight[0:3] self.pos_weight = self.vec_weight[3:6] - self._vec_convergence = self.tensor_args.to_device(self.vec_convergence) self._batch_size = 0 self._horizon = 0 + def update_metric(self, metric: PoseCostMetric): + if metric.hold_partial_pose: + if metric.hold_vec_weight is None: + log_error("hold_vec_weight is required") + self.hold_partial_pose(metric.hold_vec_weight) + if metric.release_partial_pose: + self.release_partial_pose() + if metric.reach_partial_pose: + if metric.reach_vec_weight is None: + log_error("reach_vec_weight is required") + self.reach_partial_pose(metric.reach_vec_weight) + if metric.reach_full_pose: + self.reach_full_pose() + + if metric.remove_offset_waypoint: + self.remove_offset_waypoint() + + if metric.offset_position is not None or 
metric.offset_rotation is not None:
+ self.update_offset_waypoint(
+ offset_position=metric.offset_position,
+ offset_rotation=metric.offset_rotation,
+ offset_tstep_fraction=metric.offset_tstep_fraction,
+ )
+
+ def hold_partial_pose(self, run_vec_weight: torch.Tensor):
+ self.run_vec_weight.copy_(run_vec_weight)
+
+ def release_partial_pose(self):
+ self.run_vec_weight[:] = 0.0
+
+ def reach_partial_pose(self, vec_weight: torch.Tensor):
+ self.vec_weight[:] = vec_weight
+
+ def reach_full_pose(self):
+ self.vec_weight[:] = 1.0
+
+ def update_offset_waypoint(
+ self,
+ offset_position: Optional[torch.Tensor] = None,
+ offset_rotation: Optional[torch.Tensor] = None,
+ offset_tstep_fraction: float = 0.75,
+ ):
+ if offset_position is not None:
+ self.offset_waypoint[3:].copy_(offset_position)
+ if offset_rotation is not None:
+ self.offset_waypoint[:3].copy_(offset_rotation)
+ self.offset_tstep_fraction[:] = offset_tstep_fraction
+ if self._horizon <= 0:
+ log_error(
+ "Updating offset waypoint is only possible after initializing motion gen;"
+ + " run motion_gen.warmup() before adding offset_waypoint"
+ )
+ self.update_run_weight(run_tstep_fraction=offset_tstep_fraction)
+
+ def remove_offset_waypoint(self):
+ self.offset_tstep_fraction[:] = -1.0
+ self.update_run_weight()
+
+ def update_run_weight(
+ self, run_tstep_fraction: float = 0.0, run_weight: Optional[float] = None
+ ):
+ if self._horizon == 1:
+ return
+
+ if run_weight is None:
+ run_weight = self.run_weight
+
+ active_steps = math.floor(self._horizon * run_tstep_fraction)
+ self._run_weight_vec[:, :active_steps] = 0
+ self._run_weight_vec[:, active_steps:-1] = run_weight
+ def update_batch_size(self, batch_size, horizon):
 if batch_size != self._batch_size or horizon != self._horizon:
 self.out_distance = torch.zeros(
 (batch_size, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
@@ -493,12 +269,16 @@ class PoseCost(CostBase, PoseCostConfig):
 self._run_weight_vec = torch.ones(
 (1, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
 )
- if self.terminal and self.run_weight is not None:
- self._run_weight_vec[:, :-1] *= self.run_weight
+ if self.terminal and self.run_weight is not None and horizon > 1:
+ self._run_weight_vec[:, :-1] = self.run_weight
 self._batch_size = batch_size
 self._horizon = horizon
+ @property
+ def goalset_index_buffer(self):
+ return self.out_idx
+
 def _forward_goal_distribution(self, ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot):
 ee_goal_pos = ee_goal_pos.unsqueeze(1)
 ee_goal_pos = ee_goal_pos.unsqueeze(1)
@@ -563,13 +343,13 @@ class PoseCost(CostBase, PoseCostConfig):
 def _update_cost_type(self, ee_goal_pos, ee_pos_batch, num_goals):
 d_g = len(ee_goal_pos.shape)
 b_sze = ee_goal_pos.shape[0]
- if d_g == 2 and b_sze == 1: # 1, 3
+ if d_g == 2 and b_sze == 1: # [1, 3]
 self.cost_type = PoseErrorType.SINGLE_GOAL
- elif d_g == 2 and b_sze == ee_pos_batch.shape[0]: # b, 3
+ elif d_g == 2 and b_sze > 1: # [b, 3]
 self.cost_type = PoseErrorType.BATCH_GOAL
- elif d_g == 3:
+ elif d_g == 3 and b_sze == 1: # [1, goalset, 3]
 self.cost_type = PoseErrorType.GOALSET
- elif len(ee_goal_pos.shape) == 4 and b_sze == ee_pos_bath.shape[0]:
+ elif d_g == 3 and b_sze > 1: # [b, goalset, 3]
 self.cost_type = PoseErrorType.BATCH_GOALSET
 def forward_out_distance(
@@ -599,6 +379,8 @@ class PoseCost(CostBase, PoseCostConfig):
 self._vec_convergence,
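# (editor's note, not part of this commit: the out_* tensors passed below are
# preallocated in update_batch_size for a fixed batch and horizon and reused
# across calls, so the distance kernel always writes into the same buffers
# instead of allocating new ones)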
self._run_weight_vec,
 self.run_vec_weight,
+ self.offset_waypoint,
+ self.offset_tstep_fraction,
 goal.batch_pose_idx,
 self.out_distance,
 self.out_position_distance,
@@ -613,7 +395,9 @@
 self.cost_type.value,
 num_goals,
 self.use_metric,
+ self.project_distance,
 )
 # print(goal.batch_pose_idx.shape)
 cost = distance # .view(b, h)#.clone()
 r_err = r_err # .view(b, h)
@@ -632,65 +416,46 @@
 ee_goal_rot = goal_pose.quaternion
 num_goals = goal_pose.n_goalset
 self._update_cost_type(ee_goal_pos, ee_pos_batch, num_goals)
 b, h, _ = ee_pos_batch.shape
 self.update_batch_size(b, h)
 # return self.out_distance
 # print(b,h, ee_goal_pos.shape)
- if self.return_loss:
- distance = PoseLoss.apply(
- ee_pos_batch,
- ee_goal_pos,
- ee_rot_batch, # .view(-1, 4).contiguous(),
- ee_goal_rot,
- self.vec_weight,
- self.weight,
- self._vec_convergence,
- self._run_weight_vec,
- self.run_vec_weight,
- goal.batch_pose_idx,
- self.out_distance,
- self.out_position_distance,
- self.out_rotation_distance,
- self.out_p_vec,
- self.out_q_vec,
- self.out_idx,
- self.out_p_grad,
- self.out_q_grad,
- b,
- h,
- self.cost_type.value,
- num_goals,
- self.use_metric,
- )
- else:
- distance = PoseError.apply(
- ee_pos_batch,
- ee_goal_pos,
- ee_rot_batch, # .view(-1, 4).contiguous(),
- ee_goal_rot,
- self.vec_weight,
- self.weight,
- self._vec_convergence,
- self._run_weight_vec,
- self.run_vec_weight,
- goal.batch_pose_idx,
- self.out_distance,
- self.out_position_distance,
- self.out_rotation_distance,
- self.out_p_vec,
- self.out_q_vec,
- self.out_idx,
- self.out_p_grad,
- self.out_q_grad,
- b,
- h,
- self.cost_type.value,
- num_goals,
- self.use_metric,
- )
+
+ distance = PoseError.apply(
+ ee_pos_batch,
+ ee_goal_pos,
+ ee_rot_batch, # .view(-1, 4).contiguous(),
+ ee_goal_rot,
+ self.vec_weight,
+ self.weight,
+ self._vec_convergence,
+ self._run_weight_vec,
+ self.run_vec_weight,
+ self.offset_waypoint,
+ self.offset_tstep_fraction,
+ goal.batch_pose_idx,
+ self.out_distance,
+ self.out_position_distance,
+ self.out_rotation_distance,
+ self.out_p_vec,
+ self.out_q_vec,
+ self.out_idx,
+ self.out_p_grad,
+ self.out_q_grad,
+ b,
+ h,
+ self.cost_type.value,
+ num_goals,
+ self.use_metric,
+ self.project_distance,
+ self.return_loss,
+ )
 cost = distance
 return cost
 def forward_pose(
@@ -708,56 +473,34 @@
 b = query_pose.position.shape[0]
 h = query_pose.position.shape[1]
 num_goals = 1
- if self.return_loss:
- distance = PoseLoss.apply(
- query_pose.position.unsqueeze(1),
- ee_goal_pos,
- query_pose.quaternion.unsqueeze(1),
- ee_goal_quat,
- self.vec_weight,
- self.weight,
- self._vec_convergence,
- self._run_weight_vec,
- self.run_vec_weight,
- batch_pose_idx,
- self.out_distance,
- self.out_position_distance,
-
self.out_rotation_distance, - self.out_p_vec, - self.out_q_vec, - self.out_idx, - self.out_p_grad, - self.out_q_grad, - b, - h, - self.cost_type.value, - num_goals, - self.use_metric, - ) + + distance = PoseError.apply( + query_pose.position.unsqueeze(1), + ee_goal_pos, + query_pose.quaternion.unsqueeze(1), + ee_goal_quat, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + self.offset_waypoint, + self.offset_tstep_fraction, + batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + self.project_distance, + self.return_loss, + ) return distance diff --git a/src/curobo/rollout/cost/stop_cost.py b/src/curobo/rollout/cost/stop_cost.py index f2aa54c..9a1cbbb 100644 --- a/src/curobo/rollout/cost/stop_cost.py +++ b/src/curobo/rollout/cost/stop_cost.py @@ -68,9 +68,14 @@ class StopCost(CostBase, StopCostConfig): self.max_vel = (sum_matrix @ delta_vel).unsqueeze(-1) def forward(self, vels): - vel_abs = torch.abs(vels) - vel_abs = torch.nn.functional.relu(vel_abs - self.max_vel) - - cost = self.weight * (torch.sum(vel_abs**2, dim=-1)) - + cost = velocity_cost(vels, self.weight, self.max_vel) return cost + + +@torch.jit.script +def velocity_cost(vels, weight, max_vel): + vel_abs = torch.abs(vels) + vel_abs = torch.nn.functional.relu(vel_abs - max_vel[: vels.shape[1]]) + cost = weight * (torch.sum(vel_abs**2, dim=-1)) + + return cost diff --git a/src/curobo/rollout/rollout_base.py b/src/curobo/rollout/rollout_base.py index a5edb46..31cdf9f 100644 --- a/src/curobo/rollout/rollout_base.py +++ b/src/curobo/rollout/rollout_base.py @@ -23,7 +23,7 @@ import torch # CuRobo from curobo.types.base import TensorDeviceType from curobo.types.math import Pose -from curobo.types.robot import CSpaceConfig, State, JointState +from curobo.types.robot import CSpaceConfig, State from curobo.types.tensor import ( T_BDOF, T_DOF, @@ -33,6 +33,7 @@ from curobo.types.tensor import ( T_BValue_float, ) from curobo.util.helpers import list_idx_if_not_none +from curobo.util.logger import log_info from curobo.util.sample_lib import HaltonGenerator from curobo.util.tensor_util import copy_tensor @@ -235,6 +236,7 @@ class Goal(Sequence): batch_retract_state_idx=self.batch_retract_state_idx, batch_goal_state_idx=self.batch_goal_state_idx, links_goal_pose=self.links_goal_pose, + n_goalset=self.n_goalset, ) def _tensor_repeat_seeds(self, tensor, num_seeds): @@ -353,7 +355,7 @@ class Goal(Sequence): def _copy_tensor(self, ref_buffer, buffer): if buffer is not None: - if ref_buffer is not None: + if ref_buffer is not None and buffer.shape == ref_buffer.shape: if not copy_tensor(buffer, ref_buffer): ref_buffer = buffer.clone() else: @@ -553,6 +555,10 @@ class RolloutBase: self._rollout_constraint_cuda_graph_init = False if self.cu_rollout_constraint_graph is not None: self.cu_rollout_constraint_graph.reset() + self.reset_shape() + + def reset_shape(self): + pass @abstractmethod def get_action_from_state(self, state: State): diff --git a/src/curobo/types/camera.py b/src/curobo/types/camera.py index 9f48ede..0febe84 100644 --- a/src/curobo/types/camera.py +++ b/src/curobo/types/camera.py @@ -16,10 +16,15 @@ from typing import List, Optional # Third Party import torch +from torch.profiler import record_function # CuRobo +from curobo.geom.cv import ( + get_projection_rays, + 
project_depth_using_rays, + project_pointcloud_to_depth, +) from curobo.types.math import Pose -from curobo.util.logger import log_error, log_warn @dataclass @@ -30,25 +35,82 @@ class CameraObservation: depth_image: Optional[torch.Tensor] = None image_segmentation: Optional[torch.Tensor] = None projection_matrix: Optional[torch.Tensor] = None + projection_rays: Optional[torch.Tensor] = None resolution: Optional[List[int]] = None pose: Optional[Pose] = None intrinsics: Optional[torch.Tensor] = None timestamp: float = 0.0 + def filter_depth(self, distance: float = 0.01): + self.depth_image = torch.where(self.depth_image < distance, 0, self.depth_image) + @property def shape(self): return self.rgb_image.shape + @record_function("camera/copy_") def copy_(self, new_data: CameraObservation): - self.rgb_image.copy_(new_data.rgb_image) - self.depth_image.copy_(new_data.depth_image) - self.image_segmentation.copy_(new_data.image_segmentation) - self.projection_matrix.copy_(new_data.projection_matrix) + if self.rgb_image is not None: + self.rgb_image.copy_(new_data.rgb_image) + if self.depth_image is not None: + self.depth_image.copy_(new_data.depth_image) + if self.image_segmentation is not None: + self.image_segmentation.copy_(new_data.image_segmentation) + if self.projection_matrix is not None: + self.projection_matrix.copy_(new_data.projection_matrix) + if self.projection_rays is not None: + self.projection_rays.copy_(new_data.projection_rays) + if self.pose is not None: + self.pose.copy_(new_data.pose) self.resolution = new_data.resolution + @record_function("camera/clone") + def clone(self): + + return CameraObservation( + depth_image=self.depth_image.clone() if self.depth_image is not None else None, + rgb_image=self.rgb_image.clone() if self.rgb_image is not None else None, + intrinsics=self.intrinsics.clone() if self.intrinsics is not None else None, + resolution=self.resolution, + pose=self.pose.clone() if self.pose is not None else None, + ) + def to(self, device: torch.device): if self.rgb_image is not None: self.rgb_image = self.rgb_image.to(device=device) if self.depth_image is not None: self.depth_image = self.depth_image.to(device=device) return self + + def get_pointcloud(self): + if self.projection_rays is None: + self.update_projection_rays() + depth_image = self.depth_image + if len(self.depth_image.shape) == 2: + depth_image = self.depth_image.unsqueeze(0) + point_cloud = project_depth_using_rays(depth_image, self.projection_rays) + return point_cloud + + def get_image_from_pointcloud(self, pointcloud, output_image: Optional[torch.Tensor] = None): + if output_image is None: + output_image = torch.zeros( + (self.depth_image.shape[0], self.depth_image.shape[1]), + dtype=pointcloud.dtype, + device=pointcloud.device, + ) + + depth_image = project_pointcloud_to_depth(pointcloud, output_image=output_image) + return depth_image + + def update_projection_rays(self): + intrinsics = self.intrinsics + if len(intrinsics.shape) == 2: + intrinsics = intrinsics.unsqueeze(0) + project_rays = get_projection_rays( + self.depth_image.shape[-2], self.depth_image.shape[-1], intrinsics + ) + + if self.projection_rays is None: + self.projection_rays = project_rays + + self.projection_rays.copy_(project_rays) diff --git a/src/curobo/types/math.py b/src/curobo/types/math.py index 0e8f16c..4a80d4c 100644 --- a/src/curobo/types/math.py +++ b/src/curobo/types/math.py @@ -19,6 +19,7 @@ import numpy as np import torch import torch.autograd.profiler as profiler from torch.autograd import Function +from 
torch.profiler import record_function # CuRobo from curobo.geom.transform import ( @@ -33,7 +34,7 @@ from curobo.geom.transform import ( from curobo.types.base import TensorDeviceType from curobo.util.helpers import list_idx_if_not_none from curobo.util.logger import log_error, log_info, log_warn -from curobo.util.tensor_util import copy_if_not_none, copy_tensor +from curobo.util.tensor_util import clone_if_not_none, copy_tensor # Local Folder from .tensor import T_BPosition, T_BQuaternion, T_BRotation @@ -256,10 +257,10 @@ class Pose(Sequence): def clone(self): return Pose( - position=copy_if_not_none(self.position), - quaternion=copy_if_not_none(self.quaternion), + position=clone_if_not_none(self.position), + quaternion=clone_if_not_none(self.quaternion), normalize_rotation=False, - # rotation=copy_if_not_none(self.rotation), + # rotation=clone_if_not_none(self.rotation), ) def to(self, tensor_args: TensorDeviceType): @@ -408,6 +409,7 @@ class Pose(Sequence): gpt_out, ) + @record_function("math/pose/transform_points") def batch_transform_points( self, points: torch.Tensor, @@ -432,6 +434,12 @@ class Pose(Sequence): def shape(self): return self.position.shape + def compute_offset_pose(self, offset: Pose) -> Pose: + return self.multiply(offset) + + def compute_local_pose(self, world_pose: Pose) -> Pose: + return self.inverse().multiply(world_pose) + def quat_multiply(q1, q2, q_res): a_w = q1[..., 0] diff --git a/src/curobo/types/state.py b/src/curobo/types/state.py index aa051e9..78e3f6f 100644 --- a/src/curobo/types/state.py +++ b/src/curobo/types/state.py @@ -25,7 +25,7 @@ from curobo.types.tensor import T_BDOF, T_DOF from curobo.util.logger import log_error, log_info, log_warn from curobo.util.tensor_util import ( check_tensor_shapes, - copy_if_not_none, + clone_if_not_none, copy_tensor, fd_tensor, tensor_repeat_seeds, @@ -121,12 +121,14 @@ class JointState(State): def repeat_seeds(self, num_seeds: int): return JointState( position=tensor_repeat_seeds(self.position, num_seeds), - velocity=tensor_repeat_seeds(self.velocity, num_seeds) - if self.velocity is not None - else None, - acceleration=tensor_repeat_seeds(self.acceleration, num_seeds) - if self.acceleration is not None - else None, + velocity=( + tensor_repeat_seeds(self.velocity, num_seeds) if self.velocity is not None else None + ), + acceleration=( + tensor_repeat_seeds(self.acceleration, num_seeds) + if self.acceleration is not None + else None + ), joint_names=self.joint_names, ) @@ -153,10 +155,10 @@ class JointState(State): if self.joint_names is not None: j_names = self.joint_names.copy() return JointState( - position=copy_if_not_none(self.position), - velocity=copy_if_not_none(self.velocity), - acceleration=copy_if_not_none(self.acceleration), - jerk=copy_if_not_none(self.jerk), + position=clone_if_not_none(self.position), + velocity=clone_if_not_none(self.velocity), + acceleration=clone_if_not_none(self.acceleration), + jerk=clone_if_not_none(self.jerk), joint_names=j_names, tensor_args=self.tensor_args, ) diff --git a/src/curobo/util/metrics.py b/src/curobo/util/metrics.py new file mode 100644 index 0000000..208eaf1 --- /dev/null +++ b/src/curobo/util/metrics.py @@ -0,0 +1,64 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from dataclasses import dataclass +from typing import List, Optional + +# Third Party +import numpy as np + +try: + # Third Party + from robometrics.statistics import ( + Statistic, + TrajectoryGroupMetrics, + TrajectoryMetrics, + percent_true, + ) +except ImportError: + raise ImportError( + "Benchmarking library not found, pip install " + + '"robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git"' + ) + + +@dataclass +class CuroboMetrics(TrajectoryMetrics): + time: float = np.inf + cspace_path_length: float = 0.0 + perception_success: bool = False + perception_interpolated_success: bool = False + jerk: float = np.inf + + +@dataclass +class CuroboGroupMetrics(TrajectoryGroupMetrics): + time: float = np.inf + cspace_path_length: Optional[Statistic] = None + perception_success: float = 0.0 + perception_interpolated_success: float = 0.0 + jerk: float = np.inf + + @classmethod + def from_list(cls, group: List[CuroboMetrics]): + unskipped = [m for m in group if not m.skip] + successes = [m for m in unskipped if m.success] + data = super().from_list(group) + data.time = Statistic.from_list([m.time for m in successes]) + data.cspace_path_length = Statistic.from_list([m.cspace_path_length for m in successes]) + data.perception_success = percent_true([m.perception_success for m in group]) + data.perception_interpolated_success = percent_true( + [m.perception_interpolated_success for m in group] + ) + data.jerk = Statistic.from_list([m.jerk for m in successes]) + + return data diff --git a/src/curobo/util/sample_lib.py b/src/curobo/util/sample_lib.py index b5d3604..8db2bb2 100644 --- a/src/curobo/util/sample_lib.py +++ b/src/curobo/util/sample_lib.py @@ -111,7 +111,7 @@ class HaltonSampleLib(BaseSampleLib): super().__init__(sample_config) # create halton generator: self.halton_generator = HaltonGenerator( - self.ndims, seed=self.seed, tensor_args=self.tensor_args + self.d_action, seed=self.seed, tensor_args=self.tensor_args ) def get_samples(self, sample_shape, base_seed=None, filter_smooth=False, **kwargs): @@ -122,8 +122,10 @@ class HaltonSampleLib(BaseSampleLib): seed = self.seed if base_seed is None else base_seed self.sample_shape = sample_shape self.seed = seed - self.samples = self.halton_generator.get_gaussian_samples(sample_shape[0]) - self.samples = self.samples.view(self.samples.shape[0], self.horizon, self.d_action) + self.samples = self.halton_generator.get_gaussian_samples( + sample_shape[0] * self.horizon + ) + self.samples = self.samples.view(sample_shape[0], self.horizon, self.d_action) if filter_smooth: self.samples = self.filter_smooth(self.samples) @@ -301,7 +303,7 @@ class StompSampleLib(BaseSampleLib): self.filter_coeffs = None self.halton_generator = HaltonGenerator( - self.ndims, seed=self.seed, tensor_args=self.tensor_args + self.d_action, seed=self.seed, tensor_args=self.tensor_args ) def get_samples(self, sample_shape, base_seed=None, **kwargs): @@ -314,7 +316,10 @@ class StompSampleLib(BaseSampleLib): # self.seed = seed # torch.manual_seed(self.seed) - halton_samples = self.halton_generator.get_gaussian_samples(sample_shape[0]) + halton_samples = self.halton_generator.get_gaussian_samples( + sample_shape[0] * self.horizon + ).view(sample_shape[0], self.horizon * self.d_action) + halton_samples = ( 
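# (editor's note, not part of this commit: the matmul below mixes the i.i.d.
# Halton-gaussian samples with the STOMP covariance factor stomp_scale_tril,
# yielding time-correlated, smooth perturbations rather than white noise)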
self.stomp_scale_tril.unsqueeze(0) @ halton_samples.unsqueeze(-1)
 ).squeeze(-1)
@@ -415,7 +420,7 @@ class HaltonGenerator:
 up_bounds=[1],
 low_bounds=[0],
 seed=123,
- store_buffer: Optional[int] = 1000,
+ store_buffer: Optional[int] = 2000,
 ):
 self._seed = seed
 self.tensor_args = tensor_args
@@ -499,7 +504,7 @@ class HaltonGenerator:
 @profiler.record_function("halton_generator/gaussian_samples")
 def get_gaussian_samples(self, num_samples, variance=1.0):
 std_dev = np.sqrt(variance)
- uniform_samples = self.get_samples(num_samples)
+ uniform_samples = self.get_samples(num_samples, False)
 gaussian_halton_samples = gaussian_transform(
 uniform_samples, self.proj_mat, self.i_mat, std_dev
 )
@@ -508,7 +513,7 @@
 @torch.jit.script
 def gaussian_transform(
- uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, variance: float
+ uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, std_dev: float
 ):
 """Compute a gaussian transform of uniform samples.
@@ -525,7 +530,7 @@
 # these values.
 changed_samples = 1.99 * uniform_samples - 0.99
 gaussian_halton_samples = proj_mat * torch.erfinv(changed_samples)
- i_mat = i_mat * variance
+ i_mat = i_mat * std_dev
 gaussian_halton_samples = torch.matmul(gaussian_halton_samples, i_mat)
 return gaussian_halton_samples
diff --git a/src/curobo/util/tensor_util.py b/src/curobo/util/tensor_util.py
index efd53b5..6cec216 100644
--- a/src/curobo/util/tensor_util.py
+++ b/src/curobo/util/tensor_util.py
@@ -31,11 +31,29 @@ def copy_tensor(new_tensor: torch.Tensor, mem_tensor: torch.Tensor):
 return False
-def copy_if_not_none(x):
+def copy_if_not_none(new_tensor, ref_tensor):
- """Clones x if it's not None.
+ """Copies new_tensor into ref_tensor when both are given.
- TODO: Rename this to clone_if_not_none
+ Args:
+ new_tensor (torch.Tensor): source tensor; can be None.
+ ref_tensor (torch.Tensor): destination tensor, updated in place; set to new_tensor if None.
+
+ Returns:
+ torch.Tensor: ref_tensor after the copy, or new_tensor when ref_tensor was None.
+ """
+ if ref_tensor is not None and new_tensor is not None:
+ ref_tensor.copy_(new_tensor)
+ elif ref_tensor is None and new_tensor is not None:
+ ref_tensor = new_tensor
+
+ return ref_tensor
+
+
+def clone_if_not_none(x):
+ """Clones x if it's not None.
+
+ Args: x (torch.Tensor): tensor to clone.
diff --git a/src/curobo/util/torch_utils.py b/src/curobo/util/torch_utils.py
index d81f176..ae221d5 100644
--- a/src/curobo/util/torch_utils.py
+++ b/src/curobo/util/torch_utils.py
@@ -34,3 +34,10 @@ def is_cuda_graph_available():
 log_warn("WARNING: Disabling CUDA Graph as pytorch < 1.10")
 return False
 return True
+
+
+def is_torch_compile_available():
+ if version.parse(torch.__version__) < version.parse("2.0"):
+ log_warn("WARNING: Disabling compile as pytorch < 2.0")
+ return False
+ return True
diff --git a/src/curobo/util/usd_helper.py b/src/curobo/util/usd_helper.py
index d8499a7..d53d396 100644
--- a/src/curobo/util/usd_helper.py
+++ b/src/curobo/util/usd_helper.py
@@ -16,7 +16,6 @@ from typing import Dict, List, Optional, Union
 # Third Party
 import numpy as np
 import torch
-from pxr import Gf, Sdf, Usd, UsdGeom, UsdPhysics, UsdShade
 from tqdm import tqdm
 # CuRobo
@@ -47,6 +46,15 @@ from curobo.util_file import (
 )
 from curobo.wrap.reacher.motion_gen import MotionGenResult
+try:
+ # Third Party
+ from pxr import Gf, Sdf, Usd, UsdGeom, UsdPhysics, UsdShade
+except ImportError:
+ raise ImportError(
+ "usd-core failed to import, install with pip install usd-core."
+ + " NOTE: Do not install this if using with ISAAC SIM."
+ ) + def set_prim_translate(prim, translation): UsdGeom.Xformable(prim).AddTranslateOp().Set(Gf.Vec3d(translation)) diff --git a/src/curobo/util/warp.py b/src/curobo/util/warp.py index e8a2c7b..764e10d 100644 --- a/src/curobo/util/warp.py +++ b/src/curobo/util/warp.py @@ -24,5 +24,5 @@ def init_warp(quiet=True, tensor_args: TensorDeviceType = TensorDeviceType()): # wp.config.enable_backward = True wp.init() - wp.force_load(wp.device_from_torch(tensor_args.device)) + # wp.force_load(wp.device_from_torch(tensor_args.device)) return True diff --git a/src/curobo/wrap/model/robot_segmenter.py b/src/curobo/wrap/model/robot_segmenter.py new file mode 100644 index 0000000..05b450a --- /dev/null +++ b/src/curobo/wrap/model/robot_segmenter.py @@ -0,0 +1,204 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from typing import Dict, Optional, Tuple, Union + +# Third Party +import torch +from torch.profiler import record_function + +# CuRobo +from curobo.geom.cv import ( + get_projection_rays, + project_depth_using_rays, +) +from curobo.geom.types import PointCloud +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import log_error +from curobo.util_file import get_robot_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + + +class RobotSegmenter: + def __init__( + self, + robot_world: RobotWorld, + distance_threshold: float = 0.05, + use_cuda_graph: bool = True, + ): + self._robot_world = robot_world + self._projection_rays = None + self.ready = False + self._out_points_buffer = None + self._out_gp = None + self._out_gq = None + self._out_gpt = None + self._cu_graph = None + self._use_cuda_graph = use_cuda_graph + self.tensor_args = robot_world.tensor_args + self.distance_threshold = distance_threshold + + @staticmethod + def from_robot_file( + robot_file: Union[str, Dict], + collision_sphere_buffer: Optional[float], + distance_threshold: float = 0.05, + use_cuda_graph: bool = True, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + if collision_sphere_buffer is not None: + robot_dict["kinematics"]["collision_sphere_buffer"] = collision_sphere_buffer + + robot_cfg = RobotConfig.from_dict(robot_dict, tensor_args=tensor_args) + + config = RobotWorldConfig.load_from_config( + robot_cfg, + None, + collision_activation_distance=0.0, + tensor_args=tensor_args, + ) + robot_world = RobotWorld(config) + + return RobotSegmenter( + robot_world, distance_threshold=distance_threshold, use_cuda_graph=use_cuda_graph + ) + + def get_pointcloud_from_depth(self, camera_obs: CameraObservation): + if self._projection_rays is None: + self.update_camera_projection(camera_obs) + depth_image = camera_obs.depth_image + if len(depth_image.shape) == 2: + depth_image = depth_image.unsqueeze(0) + points = 
project_depth_using_rays(depth_image, self._projection_rays) + + return points + + def update_camera_projection(self, camera_obs: CameraObservation): + intrinsics = camera_obs.intrinsics + if len(intrinsics.shape) == 2: + intrinsics = intrinsics.unsqueeze(0) + project_rays = get_projection_rays( + camera_obs.depth_image.shape[-2], camera_obs.depth_image.shape[-1], intrinsics + ) + + if self._projection_rays is None: + self._projection_rays = project_rays + + self._projection_rays.copy_(project_rays) + self.ready = True + + @record_function("robot_segmenter/get_robot_mask") + def get_robot_mask( + self, + camera_obs: CameraObservation, + joint_state: JointState, + ): + """ + Assumes 1 robot and batch of depth images, batch of poses + """ + if len(camera_obs.depth_image.shape) != 3: + log_error("Send depth image as (batch, height, width)") + + active_js = self._robot_world.get_active_js(joint_state) + + mask, filtered_image = self.get_robot_mask_from_active_js(camera_obs, active_js) + + return mask, filtered_image + + def get_robot_mask_from_active_js( + self, camera_obs: CameraObservation, active_joint_state: JointState + ): + q = active_joint_state.position + mask, filtered_image = self._call_op(camera_obs, q) + + return mask, filtered_image + + def _create_cg_graph(self, cam_obs, q): + self._cu_cam_obs = cam_obs.clone() + self._cu_q = q.clone() + s = torch.cuda.Stream(device=self.tensor_args.device) + s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device)) + + with torch.cuda.stream(s): + for _ in range(3): + self._cu_out, self._cu_filtered_out = self._mask_op(self._cu_cam_obs, self._cu_q) + torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s) + + self._cu_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self._cu_graph, stream=s): + self._cu_out, self._cu_filtered_out = self._mask_op( + self._cu_cam_obs, + self._cu_q, + ) + + def _call_op(self, cam_obs, q): + if self._use_cuda_graph: + if self._cu_graph is None: + self._create_cg_graph(cam_obs, q) + self._cu_cam_obs.copy_(cam_obs) + self._cu_q.copy_(q) + self._cu_graph.replay() + return self._cu_out.clone(), self._cu_filtered_out.clone() + return self._mask_op(cam_obs, q) + + @record_function("robot_segmenter/_mask_op") + def _mask_op(self, camera_obs, q): + if len(q.shape) == 1: + q = q.unsqueeze(0) + points = self.get_pointcloud_from_depth(camera_obs) + camera_to_robot = camera_obs.pose + + if self._out_points_buffer is None: + self._out_points_buffer = points.clone() + if self._out_gpt is None: + self._out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=points.device) + if self._out_gp is None: + self._out_gp = torch.zeros((camera_to_robot.position.shape[0], 3), device=points.device) + if self._out_gq is None: + self._out_gq = torch.zeros( + (camera_to_robot.quaternion.shape[0], 4), device=points.device + ) + + points_in_robot_frame = camera_to_robot.batch_transform_points( + points, + out_buffer=self._out_points_buffer, + gp_out=self._out_gp, + gq_out=self._out_gq, + gpt_out=self._out_gpt, + ) + + out_points = points_in_robot_frame + + dist = self._robot_world.get_point_robot_distance(out_points, q) + + mask, filtered_image = mask_image(camera_obs.depth_image, dist, self.distance_threshold) + + return mask, filtered_image + + +@torch.jit.script +def mask_image( + image: torch.Tensor, distance: torch.Tensor, distance_threshold: float +) -> Tuple[torch.Tensor, torch.Tensor]: + distance = distance.view( + image.shape[0], + image.shape[1], + image.shape[2], + ) + mask = 
torch.logical_and((image > 0.0), (distance > -distance_threshold))
+    filtered_image = torch.where(mask, 0, image)
+    return mask, filtered_image
diff --git a/src/curobo/wrap/model/robot_world.py b/src/curobo/wrap/model/robot_world.py
index 69bbfa0..2deba06 100644
--- a/src/curobo/wrap/model/robot_world.py
+++ b/src/curobo/wrap/model/robot_world.py
@@ -34,6 +34,7 @@
 from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
 from curobo.types.robot import RobotConfig
 from curobo.types.state import JointState
+from curobo.util.logger import log_error
 from curobo.util.sample_lib import HaltonGenerator
 from curobo.util.warp import init_warp
 from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
@@ -60,8 +61,8 @@ class RobotWorldConfig:
         world_model: Union[None, str, Dict, WorldConfig, List[WorldConfig], List[str]] = None,
         tensor_args: TensorDeviceType = TensorDeviceType(),
         n_envs: int = 1,
-        n_meshes: int = 10,
-        n_cuboids: int = 10,
+        n_meshes: int = 50,
+        n_cuboids: int = 50,
         collision_activation_distance: float = 0.2,
         self_collision_activation_distance: float = 0.0,
         max_collision_distance: float = 1.0,
@@ -74,6 +75,8 @@
         if isinstance(robot_config, str):
             robot_config = load_yaml(join_path(get_robot_configs_path(), robot_config))["robot_cfg"]
         if isinstance(robot_config, Dict):
+            if "robot_cfg" in robot_config:
+                robot_config = robot_config["robot_cfg"]
             robot_config = RobotConfig.from_dict(robot_config, tensor_args)
         kinematics = CudaRobotModel(robot_config.kinematics)
@@ -178,8 +181,11 @@ class RobotWorld(RobotWorldConfig):
     def __init__(self, config: RobotWorldConfig) -> None:
         RobotWorldConfig.__init__(self, **vars(config))
         self._batch_pose_idx = None
+        self._camera_projection_rays = None

     def get_kinematics(self, q: torch.Tensor) -> CudaRobotModelState:
+        if len(q.shape) == 1:
+            log_error("q should be of shape [b, dof]")
         state = self.kinematics.get_state(q)
         return state
@@ -344,6 +350,37 @@
         distance = self.pose_cost.forward_pose(x_des, x_current, self._batch_pose_idx)
         return distance

+    def get_point_robot_distance(self, points: torch.Tensor, q: torch.Tensor):
+        """Compute the distance from the robot at joint configuration q to points (e.g., a pointcloud).
+
+        Args:
+            points: [b, n, 3]
+            q: [1, dof]
+
+        Returns:
+            distance: [b, n] Positive values are in collision with the robot.
+
+        NOTE: This currently does not support batched robot configurations but can be extended easily.
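+
+        Example (illustrative sketch, not part of this commit; assumes
+        ``robot_world`` is an initialized RobotWorld and ``q`` is a valid
+        [1, dof] joint configuration on the same device):
+
+            points = torch.rand((1, 1024, 3), device=q.device)  # [b, n, 3]
+            dist = robot_world.get_point_robot_distance(points, q)  # [1, 1024]
+            robot_points = dist > 0.0  # True where a point lies inside a robot sphere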
+ """ + if len(q.shape) == 1: + log_error("q should be of shape [b, dof]") + kin_state = self.get_kinematics(q) + b, n = None, None + if len(points.shape) == 3: + b, n, _ = points.shape + points = points.view(b * n, 3) + + pt_distance = point_robot_distance(kin_state.link_spheres_tensor, points) + + if b is not None: + pt_distance = pt_distance.view(b, n) + + return pt_distance + + def get_active_js(self, full_js: JointState): + active_jnames = self.kinematics.joint_names + out_js = full_js.get_ordered_joint_state(active_jnames) + return out_js + @torch.jit.script def sum_mask(d1, d2, d3): @@ -357,3 +394,15 @@ def mask(d1, d2, d3): d_total = d1 + d2 + d3 d_mask = d_total == 0.0 return d_mask + + +@torch.jit.script +def point_robot_distance(link_spheres_tensor, points): + robot_spheres = link_spheres_tensor.view(1, -1, 4).contiguous() + robot_radius = robot_spheres[:, :, 3] + points = points.unsqueeze(1) + sph_distance = ( + torch.linalg.norm(points - robot_spheres[:, :, :3], dim=-1) - robot_radius + ) # b, n_spheres + pt_distance = torch.max(-1 * sph_distance, dim=-1)[0] + return pt_distance diff --git a/src/curobo/wrap/reacher/evaluator.py b/src/curobo/wrap/reacher/evaluator.py index 64e958f..2622efa 100644 --- a/src/curobo/wrap/reacher/evaluator.py +++ b/src/curobo/wrap/reacher/evaluator.py @@ -52,7 +52,8 @@ def smooth_cost(abs_acc, abs_jerk, opt_dt): # jerk = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0] jerk = torch.mean(torch.max(abs_jerk, dim=-1)[0], dim=-1) mean_acc = torch.mean(torch.max(abs_acc, dim=-1)[0], dim=-1) # [0] - a = (jerk * 0.001) + opt_dt + (mean_acc * 0.01) + a = (jerk * 0.001) + 5.0 * opt_dt + (mean_acc * 0.01) + return a diff --git a/src/curobo/wrap/reacher/ik_solver.py b/src/curobo/wrap/reacher/ik_solver.py index b8f57a0..9192dc1 100644 --- a/src/curobo/wrap/reacher/ik_solver.py +++ b/src/curobo/wrap/reacher/ik_solver.py @@ -26,6 +26,7 @@ from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig +from curobo.rollout.cost.pose_cost import PoseCostMetric from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics from curobo.types.base import TensorDeviceType from curobo.types.math import Pose @@ -56,6 +57,7 @@ class IKSolverConfig: world_coll_checker: Optional[WorldCollision] = None sample_rejection_ratio: int = 50 tensor_args: TensorDeviceType = TensorDeviceType() + use_cuda_graph: bool = True @staticmethod @profiler.record_function("ik_solver/load_from_robot_config") @@ -72,12 +74,12 @@ class IKSolverConfig: base_cfg_file: str = "base_cfg.yml", particle_file: str = "particle_ik.yml", gradient_file: str = "gradient_ik.yml", - use_cuda_graph: Optional[bool] = None, + use_cuda_graph: bool = True, self_collision_check: bool = True, self_collision_opt: bool = True, grad_iters: Optional[int] = None, use_particle_opt: bool = True, - collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE, + collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.MESH, sync_cuda_time: Optional[bool] = None, use_gradient_descent: bool = False, collision_cache: Optional[Dict[str, int]] = None, @@ -90,6 +92,7 @@ class IKSolverConfig: regularization: bool = True, collision_activation_distance: Optional[float] = None, high_precision: bool = False, + project_pose_to_goal_frame: bool = True, 
): if position_threshold <= 0.001: high_precision = True @@ -116,6 +119,9 @@ class IKSolverConfig: base_config_data["convergence"]["cspace_cfg"]["weight"] = 0.0 config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0 grad_config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0 + + if isinstance(robot_cfg, str): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] if ee_link_name is not None: if isinstance(robot_cfg, RobotConfig): raise NotImplementedError("ee link cannot be changed after creating RobotConfig") @@ -123,8 +129,6 @@ class IKSolverConfig: robot_cfg.kinematics.ee_link = ee_link_name else: robot_cfg["kinematics"]["ee_link"] = ee_link_name - if isinstance(robot_cfg, str): - robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] if isinstance(robot_cfg, dict): robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) @@ -160,8 +164,8 @@ class IKSolverConfig: grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 if grad_iters is not None: grad_config_data["lbfgs"]["n_iters"] = grad_iters - config_data["mppi"]["n_envs"] = 1 - grad_config_data["lbfgs"]["n_envs"] = 1 + config_data["mppi"]["n_problems"] = 1 + grad_config_data["lbfgs"]["n_problems"] = 1 grad_cfg = ArmReacherConfig.from_dict( robot_cfg, grad_config_data["model"], @@ -241,6 +245,7 @@ class IKSolverConfig: world_coll_checker=world_coll_checker, rollout_fn=aux_rollout, tensor_args=tensor_args, + use_cuda_graph=use_cuda_graph, ) return ik_cfg @@ -259,6 +264,7 @@ class IKResult(Sequence): error: T_BValue_float solve_time: float debug_info: Optional[Any] = None + goalset_index: Optional[torch.Tensor] = None def __getitem__(self, idx): success = self.success[idx] @@ -272,6 +278,7 @@ class IKResult(Sequence): position_error=self.position_error[idx], rotation_error=self.rotation_error[idx], debug_info=self.debug_info, + goalset_index=None if self.goalset_index is None else self.goalset_index[idx], ) def __len__(self): @@ -317,7 +324,7 @@ class IKSolver(IKSolverConfig): self.solver.rollout_fn.retract_state.unsqueeze(0) ) self.dof = self.solver.safety_rollout.d_action - self._col = torch.arange(0, 1, device=self.tensor_args.device, dtype=torch.long) + self._col = None # torch.arange(0, 1, device=self.tensor_args.device, dtype=torch.long) # self.fixed_seeds = self.solver.safety_rollout.sample_random_actions(100 * 200) # create random seeder: @@ -355,10 +362,14 @@ class IKSolver(IKSolverConfig): self._goal_buffer, self.tensor_args, ) - # print("Goal:", self._goal_buffer.links_goal_pose) + if update_reference: - self.solver.update_nenvs(self._solve_state.get_ik_batch_size()) - self.reset_cuda_graph() + self.reset_shape() + if self.use_cuda_graph and self._col is not None: + log_error("changing goal type, breaking previous cuda graph.") + self.reset_cuda_graph() + + self.solver.update_nproblems(self._solve_state.get_ik_batch_size()) self._goal_buffer.current_state = self.init_state.repeat_seeds(goal_pose.batch) self._col = torch.arange( 0, @@ -676,6 +687,8 @@ class IKSolver(IKSolverConfig): if newton_iters is not None: self.solver.newton_optimizer.outer_iters = self.og_newton_iters ik_result = self.get_result(num_seeds, result, goal_buffer.goal_pose, return_seeds) + if ik_result.goalset_index is not None: + ik_result.goalset_index[ik_result.goalset_index >= goal_pose.n_goalset] = 0 return ik_result @@ -684,15 +697,18 @@ class IKSolver(IKSolverConfig): self, num_seeds: int, result: WrapResult, goal_pose: Pose, return_seeds: int ) -> IKResult: success = 
self.get_success(result.metrics, num_seeds=num_seeds) - if result.metrics.cost is not None: - result.metrics.pose_error += result.metrics.cost + # if result.metrics.cost is not None: + # result.metrics.pose_error += result.metrics.cost * 0.0001 + if result.metrics.null_space_error is not None: + result.metrics.pose_error += result.metrics.null_space_error if result.metrics.cspace_error is not None: result.metrics.pose_error += result.metrics.cspace_error - q_sol, success, position_error, rotation_error, total_error = get_result( + q_sol, success, position_error, rotation_error, total_error, goalset_index = get_result( result.metrics.pose_error, result.metrics.position_error, result.metrics.rotation_error, + result.metrics.goalset_index, success, result.action.position, self._col, @@ -717,6 +733,7 @@ class IKSolver(IKSolverConfig): solve_time=result.solve_time, error=total_error, debug_info={"solver": result.debug}, + goalset_index=goalset_index, ) return ik_result @@ -959,6 +976,10 @@ class IKSolver(IKSolverConfig): self.solver.reset_cuda_graph() self.rollout_fn.reset_cuda_graph() + def reset_shape(self): + self.solver.reset_shape() + self.rollout_fn.reset_shape() + def attach_object_to_robot( self, sphere_radius: float, @@ -977,6 +998,17 @@ class IKSolver(IKSolverConfig): def get_retract_config(self): return self.rollout_fn.dynamics_model.retract_config + def update_pose_cost_metric( + self, + metric: PoseCostMetric, + ): + rollouts = self.get_all_rollout_instances() + [ + rollout.update_pose_cost_metric(metric) + for rollout in rollouts + if isinstance(rollout, ArmReacher) + ] + @torch.jit.script def get_success( @@ -1001,6 +1033,7 @@ def get_result( pose_error, position_error, rotation_error, + goalset_index: Union[torch.Tensor, None], success, sol_position, col, @@ -1018,4 +1051,6 @@ def get_result( position_error = position_error[idx].view(batch_size, return_seeds) rotation_error = rotation_error[idx].view(batch_size, return_seeds) total_error = position_error + rotation_error - return q_sol, success, position_error, rotation_error, total_error + if goalset_index is not None: + goalset_index = goalset_index[idx].view(batch_size, return_seeds) + return q_sol, success, position_error, rotation_error, total_error, goalset_index diff --git a/src/curobo/wrap/reacher/motion_gen.py b/src/curobo/wrap/reacher/motion_gen.py index 2c767f3..b152e09 100644 --- a/src/curobo/wrap/reacher/motion_gen.py +++ b/src/curobo/wrap/reacher/motion_gen.py @@ -32,6 +32,8 @@ from curobo.geom.sphere_fit import SphereFitType from curobo.geom.types import Cuboid, Obstacle, WorldConfig from curobo.graph.graph_base import GraphConfig, GraphPlanBase, GraphResult from curobo.graph.prm import PRMStar +from curobo.rollout.arm_reacher import ArmReacher +from curobo.rollout.cost.pose_cost import PoseCostMetric from curobo.rollout.dynamics_model.kinematic_model import KinematicModelState from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics from curobo.types.base import TensorDeviceType @@ -59,7 +61,7 @@ from curobo.wrap.reacher.types import ReacherSolveState, ReacherSolveType @dataclass class MotionGenConfig: - """Creates a configuration file for running motion generation.""" + """Configuration dataclass for creating a motion generation instance.""" #: number of IK seeds to run per query problem. ik_seeds: int @@ -122,7 +124,10 @@ class MotionGenConfig: interpolation_dt: float = 0.01 #: scale initial dt by this value to finetune trajectory optimization. 
-    finetune_dt_scale: float = 0.98
+    finetune_dt_scale: float = 0.9
+
+    #: record compute ops as cuda graphs and replay recorded graphs for up to 10x faster execution.
+    use_cuda_graph: bool = True

     @staticmethod
     @profiler.record_function("motion_gen_config/load_from_robot_config")
@@ -146,19 +151,19 @@ class MotionGenConfig:
         graph_file: str = "graph.yml",
         particle_trajopt_file: str = "particle_trajopt.yml",
         gradient_trajopt_file: str = "gradient_trajopt.yml",
-        finetune_trajopt_file: str = "finetune_trajopt.yml",
+        finetune_trajopt_file: Optional[str] = None,
         trajopt_tsteps: int = 32,
         interpolation_steps: int = 5000,
         interpolation_dt: float = 0.02,
         interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA,
-        use_cuda_graph: Optional[bool] = None,
+        use_cuda_graph: bool = True,
         self_collision_check: bool = True,
         self_collision_opt: bool = True,
         grad_trajopt_iters: Optional[int] = None,
         trajopt_seed_ratio: Dict[str, int] = {"linear": 1.0, "bias": 0.0},
         ik_opt_iters: Optional[int] = None,
         ik_particle_opt: bool = True,
-        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE,
+        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.MESH,
         sync_cuda_time: Optional[bool] = None,
         trajopt_particle_opt: bool = True,
         traj_evaluator_config: Optional[TrajEvaluatorConfig] = None,
@@ -199,84 +204,86 @@ class MotionGenConfig:
         acceleration_scale: Optional[Union[List[float], float]] = None,
         jerk_scale: Optional[Union[List[float], float]] = None,
         optimize_dt: bool = True,
+        project_pose_to_goal_frame: bool = True,
     ):
-        """Load motion generation configuration from robot and world configurations.
+        """Helper function to create configuration from robot and world configuration.

         Args:
-            robot_cfg (Union[Union[str, Dict], RobotConfig]): robot configuration to load.
-            world_model (Optional[Union[Union[str, Dict], WorldConfig]], optional): World configuration to load. Defaults to None.
-            tensor_args (TensorDeviceType, optional): _description_. Defaults to TensorDeviceType().
-            num_ik_seeds (int, optional): _description_. Defaults to 30.
-            num_graph_seeds (int, optional): _description_. Defaults to 1.
-            num_trajopt_seeds (int, optional): _description_. Defaults to 12.
-            num_batch_ik_seeds (int, optional): _description_. Defaults to 30.
-            num_batch_trajopt_seeds (int, optional): _description_. Defaults to 1.
-            num_trajopt_noisy_seeds (int, optional): _description_. Defaults to 1.
-            position_threshold (float, optional): _description_. Defaults to 0.005.
-            rotation_threshold (float, optional): _description_. Defaults to 0.05.
-            cspace_threshold (float, optional): _description_. Defaults to 0.05.
-            world_coll_checker (_type_, optional): _description_. Defaults to None.
-            base_cfg_file (str, optional): _description_. Defaults to "base_cfg.yml".
-            particle_ik_file (str, optional): _description_. Defaults to "particle_ik.yml".
-            gradient_ik_file (str, optional): _description_. Defaults to "gradient_ik.yml".
-            graph_file (str, optional): _description_. Defaults to "graph.yml".
-            particle_trajopt_file (str, optional): _description_. Defaults to "particle_trajopt.yml".
-            gradient_trajopt_file (str, optional): _description_. Defaults to "gradient_trajopt.yml".
-            finetune_trajopt_file (str, optional): _description_. Defaults to "finetune_trajopt.yml".
-            trajopt_tsteps (int, optional): _description_. Defaults to 32.
-            interpolation_steps (int, optional): _description_. Defaults to 5000.
-            interpolation_dt (float, optional): _description_. Defaults to 0.02.
- interpolation_type (InterpolateType, optional): _description_. Defaults to InterpolateType.LINEAR_CUDA. - use_cuda_graph (Optional[bool], optional): _description_. Defaults to None. - self_collision_check (bool, optional): _description_. Defaults to True. - self_collision_opt (bool, optional): _description_. Defaults to True. - grad_trajopt_iters (Optional[int], optional): _description_. Defaults to None. - trajopt_seed_ratio (_type_, optional): _description_. Defaults to {"linear": 1.0, "bias": 0.0}. - ik_opt_iters (Optional[int], optional): _description_. Defaults to None. - ik_particle_opt (bool, optional): _description_. Defaults to True. - collision_checker_type (Optional[CollisionCheckerType], optional): _description_. Defaults to CollisionCheckerType.PRIMITIVE. - sync_cuda_time (Optional[bool], optional): _description_. Defaults to None. - trajopt_particle_opt (bool, optional): _description_. Defaults to True. - traj_evaluator_config (Optional[TrajEvaluatorConfig], optional): _description_. Defaults to None. - traj_evaluator (Optional[TrajEvaluator], optional): _description_. Defaults to None. - minimize_jerk (bool, optional): _description_. Defaults to True. - filter_robot_command (bool, optional): _description_. Defaults to True. - use_gradient_descent (bool, optional): _description_. Defaults to False. - collision_cache (Optional[Dict[str, int]], optional): _description_. Defaults to None. - n_collision_envs (Optional[int], optional): _description_. Defaults to None. - ee_link_name (Optional[str], optional): _description_. Defaults to None. - use_es_ik (Optional[bool], optional): _description_. Defaults to None. - use_es_trajopt (Optional[bool], optional): _description_. Defaults to None. - es_ik_learning_rate (float, optional): _description_. Defaults to 1.0. - es_trajopt_learning_rate (float, optional): _description_. Defaults to 1.0. - use_ik_fixed_samples (Optional[bool], optional): _description_. Defaults to None. - use_trajopt_fixed_samples (Optional[bool], optional): _description_. Defaults to None. - evaluate_interpolated_trajectory (bool, optional): _description_. Defaults to True. - partial_ik_iters (int, optional): _description_. Defaults to 2. - fixed_iters_trajopt (Optional[bool], optional): _description_. Defaults to None. - store_ik_debug (bool, optional): _description_. Defaults to False. - store_trajopt_debug (bool, optional): _description_. Defaults to False. - graph_trajopt_iters (Optional[int], optional): _description_. Defaults to None. - collision_max_outside_distance (Optional[float], optional): _description_. Defaults to None. - collision_activation_distance (Optional[float], optional): _description_. Defaults to None. - trajopt_dt (Optional[float], optional): _description_. Defaults to None. - js_trajopt_dt (Optional[float], optional): _description_. Defaults to None. - js_trajopt_tsteps (Optional[int], optional): _description_. Defaults to None. - trim_steps (Optional[List[int]], optional): _description_. Defaults to None. - store_debug_in_result (bool, optional): _description_. Defaults to False. - finetune_trajopt_iters (Optional[int], optional): _description_. Defaults to None. - smooth_weight (List[float], optional): _description_. Defaults to None. - finetune_smooth_weight (Optional[List[float]], optional): _description_. Defaults to None. - state_finite_difference_mode (Optional[str], optional): _description_. Defaults to None. - finetune_dt_scale (float, optional): _description_. Defaults to 1.05. 
- maximum_trajectory_time (Optional[float], optional): _description_. Defaults to None. - maximum_trajectory_dt (float, optional): _description_. Defaults to 0.1. - velocity_scale (Optional[Union[List[float], float]], optional): _description_. Defaults to None. - acceleration_scale (Optional[Union[List[float], float]], optional): _description_. Defaults to None. - jerk_scale (Optional[Union[List[float], float]], optional): _description_. Defaults to None. + robot_cfg: Robot configuration to use for motion generation. + world_model: World model to use for motion generation. Use None to disable world model. + tensor_args: Tensor device to use for motion generation. Use to change cuda device id. + num_ik_seeds: Number of seeds to use in inverse kinematics (IK) optimization. + num_graph_seeds: Number of graph paths to use as seed in trajectory optimization. + num_trajopt_seeds: Number of seeds to use in trajectory optimization. + num_batch_ik_seeds: Number of seeds to use in batch planning modes for IK. + num_batch_trajopt_seeds: Number of seeds to use in batch planning modes for trajopt. + num_trajopt_noisy_seeds: Number of seeds to use for trajopt. + position_threshold: _description_ + rotation_threshold: _description_ + cspace_threshold: _description_ + world_coll_checker: _description_ + base_cfg_file: _description_ + particle_ik_file: _description_ + gradient_ik_file: _description_ + graph_file: _description_ + particle_trajopt_file: _description_ + gradient_trajopt_file: _description_ + finetune_trajopt_file: _description_ + trajopt_tsteps: _description_ + interpolation_steps: _description_ + interpolation_dt: _description_ + interpolation_type: _description_ + use_cuda_graph: _description_ + self_collision_check: _description_ + self_collision_opt: _description_ + grad_trajopt_iters: _description_ + trajopt_seed_ratio: _description_ + ik_opt_iters: _description_ + ik_particle_opt: _description_ + collision_checker_type: _description_ + sync_cuda_time: _description_ + trajopt_particle_opt: _description_ + traj_evaluator_config: _description_ + traj_evaluator: _description_ + minimize_jerk: _description_ + filter_robot_command: _description_ + use_gradient_descent: _description_ + collision_cache: _description_ + n_collision_envs: _description_ + ee_link_name: _description_ + use_es_ik: _description_ + use_es_trajopt: _description_ + es_ik_learning_rate: _description_ + es_trajopt_learning_rate: _description_ + use_ik_fixed_samples: _description_ + use_trajopt_fixed_samples: _description_ + evaluate_interpolated_trajectory: _description_ + partial_ik_iters: _description_ + fixed_iters_trajopt: _description_ + store_ik_debug: _description_ + store_trajopt_debug: _description_ + graph_trajopt_iters: _description_ + collision_max_outside_distance: _description_ + collision_activation_distance: _description_ + trajopt_dt: _description_ + js_trajopt_dt: _description_ + js_trajopt_tsteps: _description_ + trim_steps: _description_ + store_debug_in_result: _description_ + finetune_trajopt_iters: _description_ + smooth_weight: _description_ + finetune_smooth_weight: _description_ + state_finite_difference_mode: _description_ + finetune_dt_scale: _description_ + maximum_trajectory_time: _description_ + maximum_trajectory_dt: _description_ + velocity_scale: _description_ + acceleration_scale: _description_ + jerk_scale: _description_ + optimize_dt: _description_ Returns: - _type_: _description_ + _description_ """ init_warp(tensor_args=tensor_args) @@ -306,7 +313,7 @@ class MotionGenConfig: and 
min(velocity_scale) < 1.0 and maximum_trajectory_dt <= 0.1 ): - maximum_trajectory_dt = np.sqrt(1.0 / min(velocity_scale)) * maximum_trajectory_dt + maximum_trajectory_dt = (1.0 / min(velocity_scale)) * maximum_trajectory_dt if traj_evaluator_config is None: if maximum_trajectory_dt is not None: @@ -316,21 +323,54 @@ class MotionGenConfig: if acceleration_scale is not None: max_dt = max_dt * (1.0 / np.sqrt(min(acceleration_scale))) traj_evaluator_config = TrajEvaluatorConfig(min_dt=interpolation_dt, max_dt=max_dt) + if maximum_trajectory_dt is not None: + if trajopt_dt is None: + trajopt_dt = maximum_trajectory_dt + if js_trajopt_dt is None: + js_trajopt_dt = maximum_trajectory_dt if acceleration_scale is not None and min(acceleration_scale) < 0.5: fixed_iters_trajopt = True if velocity_scale is not None and min(velocity_scale) < 0.5: fixed_iters_trajopt = True + if ( + velocity_scale is not None + and min(velocity_scale) < 0.1 + and finetune_trajopt_file is None + ): + log_error( + "velocity scale<0.1 is not supported with default finetune_trajopt.yml " + + "provide your own finetune_trajopt_file to override this error" + ) + + if ( + velocity_scale is not None + and min(velocity_scale) <= 0.25 + and finetune_trajopt_file is None + ): + finetune_trajopt_file = "finetune_trajopt_slow.yml" + if isinstance(robot_cfg, str): robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] elif isinstance(robot_cfg, Dict) and "robot_cfg" in robot_cfg.keys(): robot_cfg = robot_cfg["robot_cfg"] if isinstance(robot_cfg, RobotConfig): - if ee_link_name is not None: + if ( + ee_link_name is not None + and robot_cfg.kinematics.kinematics_config.ee_link != ee_link_name + ): log_error("ee link cannot be changed after creating RobotConfig") - if acceleration_scale is not None: + if ( + acceleration_scale is not None + and torch.max(robot_cfg.kinematics.kinematics_config.cspace.acceleration_scale) + != acceleration_scale + ): log_error("acceleration_scale cannot be changed after creating RobotConfig") - if velocity_scale is not None: + if ( + velocity_scale is not None + and torch.max(robot_cfg.kinematics.kinematics_config.cspace.velocity_scale) + != velocity_scale + ): log_error("velocity cannot be changed after creating RobotConfig") else: if ee_link_name is not None: @@ -395,6 +435,7 @@ class MotionGenConfig: use_fixed_samples=use_ik_fixed_samples, store_debug=store_ik_debug, collision_activation_distance=collision_activation_distance, + project_pose_to_goal_frame=project_pose_to_goal_frame, ) ik_solver = IKSolver(ik_solver_cfg) @@ -445,7 +486,6 @@ class MotionGenConfig: store_debug=store_trajopt_debug, collision_activation_distance=collision_activation_distance, trajopt_dt=trajopt_dt, - # trim_steps=trim_steps, store_debug_in_result=store_debug_in_result, smooth_weight=smooth_weight, cspace_threshold=cspace_threshold, @@ -453,6 +493,7 @@ class MotionGenConfig: filter_robot_command=filter_robot_command, minimize_jerk=minimize_jerk, optimize_dt=optimize_dt, + project_pose_to_goal_frame=project_pose_to_goal_frame, ) trajopt_solver = TrajOptSolver(trajopt_cfg) @@ -496,6 +537,9 @@ class MotionGenConfig: ) js_trajopt_solver = TrajOptSolver(js_trajopt_cfg) + if finetune_trajopt_file is None: + finetune_trajopt_file = "finetune_trajopt.yml" + finetune_trajopt_cfg = TrajOptSolverConfig.load_from_robot_config( robot_cfg=robot_cfg, world_model=world_model, @@ -531,6 +575,7 @@ class MotionGenConfig: use_gradient_descent=use_gradient_descent, filter_robot_command=filter_robot_command, 
optimize_dt=optimize_dt, + project_pose_to_goal_frame=project_pose_to_goal_frame, ) finetune_trajopt_solver = TrajOptSolver(finetune_trajopt_cfg) @@ -562,6 +607,7 @@ class MotionGenConfig: store_debug_in_result=store_debug_in_result, interpolation_dt=interpolation_dt, finetune_dt_scale=finetune_dt_scale, + use_cuda_graph=use_cuda_graph, ) @@ -636,6 +682,9 @@ class MotionGenResult: #: stores graph plan. graph_plan: Optional[JointState] = None + #: stores the index of the goal pose reached when planning for a goalset. + goalset_index: Optional[torch.Tensor] = None + def clone(self): m = MotionGenResult( self.success.clone(), @@ -657,10 +706,11 @@ class MotionGenResult: trajopt_attempts=self.trajopt_attempts, used_graph=self.used_graph, path_buffer_last_tstep=self.path_buffer_last_tstep, - interpolated_plan=self.interpolated_plan.clone() - if self.interpolated_plan is not None - else None, + interpolated_plan=( + self.interpolated_plan.clone() if self.interpolated_plan is not None else None + ), interpolation_dt=self.interpolation_dt, + goalset_index=self.goalset_index.clone() if self.goalset_index is not None else None, ) return m @@ -709,6 +759,9 @@ class MotionGenResult: self.cspace_error, source_result.cspace_error, idx ) + self.goalset_index = self._check_none_and_copy_idx( + self.goalset_index, source_result.goalset_index, idx + ) # NOTE: graph plan will have different shape based on success. # self.graph_plan = self._check_none_and_copy_idx( # self.graph_plan, source_result.graph_plan, idx @@ -778,9 +831,15 @@ class MotionGenPlanConfig: num_trajopt_seeds: Optional[int] = None success_ratio: float = 1 fail_on_invalid_query: bool = True + #: enables retiming trajectory optimization, useful for getting low jerk trajectories. enable_finetune_trajopt: bool = True - parallel_finetune: bool = False + parallel_finetune: bool = True + + #: use start config as regularization + use_start_state_as_retract: bool = True + + pose_cost_metric: Optional[PoseCostMetric] = None def __post_init__(self): if not self.enable_opt and not self.enable_graph: @@ -805,6 +864,10 @@ class MotionGenPlanConfig: fail_on_invalid_query=self.fail_on_invalid_query, enable_finetune_trajopt=self.enable_finetune_trajopt, parallel_finetune=self.parallel_finetune, + use_start_state_as_retract=self.use_start_state_as_retract, + pose_cost_metric=( + None if self.pose_cost_metric is None else self.pose_cost_metric.clone() + ), ) @@ -819,6 +882,9 @@ class MotionGen(MotionGenConfig): self._batch_graph_search_buffer = None self._batch_path_buffer_last_tstep = None self._rollout_list = None + self._solver_rollout_list = None + self._pose_solver_rollout_list = None + self._kin_list = None self.update_batch_size(seeds=self.trajopt_seeds) @@ -1022,6 +1088,18 @@ class MotionGen(MotionGenConfig): link_poses: List[Pose] = None, ): start_time = time.time() + + if plan_config.pose_cost_metric is not None: + valid_query = self.update_pose_cost_metric( + plan_config.pose_cost_metric, start_state, goal_pose + ) + if not valid_query: + result = MotionGenResult( + success=torch.as_tensor([False], device=self.tensor_args.device), + valid_query=valid_query, + status="Invalid Hold Partial Pose", + ) + return result # if plan_config.enable_opt: self.update_batch_size(seeds=solve_state.num_trajopt_seeds, batch=solve_state.batch_size) if solve_state.batch_env: @@ -1097,6 +1175,8 @@ class MotionGen(MotionGenConfig): result.trajopt_attempts = time_dict["trajopt_attempts"] result.attempts = n + 1 torch.cuda.synchronize() + if plan_config.pose_cost_metric 
is not None:
+            self.update_pose_cost_metric(PoseCostMetric.reset_metric())
         result.total_time = time.time() - start_time
         return result
@@ -1109,6 +1189,21 @@
     ):
         start_time = time.time()

+        if plan_config.pose_cost_metric is not None:
+            valid_query = self.update_pose_cost_metric(
+                plan_config.pose_cost_metric, start_state, goal_pose
+            )
+            if not valid_query:
+                result = MotionGenResult(
+                    success=torch.as_tensor(
+                        [False for _ in range(solve_state.batch_size)],
+                        device=self.tensor_args.device,
+                    ),
+                    valid_query=valid_query,
+                    status="Invalid Hold Partial Pose",
+                )
+                return result
+
         if solve_state.batch_env:
             if solve_state.batch_size > self.world_coll_checker.n_envs:
                 raise ValueError("Batch Env is less that goal batch")
@@ -1144,9 +1239,6 @@
                 solve_state, start_state, goal_pose, plan_config
             )
-            # torch.cuda.synchronize()
-            # print("Success: ",torch.count_nonzero(result.success))
-            # print(result.success)

             time_dict["solve_time"] += result.solve_time
             time_dict["ik_time"] += result.ik_time
@@ -1161,7 +1253,6 @@
             else:
                 # get success idx:
                 idx = torch.nonzero(result.success).reshape(-1)
-                # print(idx, result.optimized_plan.shape, best_result.optimized_plan.shape)
                 if len(idx) > 0:
                     best_result.copy_idx(idx, result)
@@ -1208,6 +1299,8 @@
         best_result.trajopt_attempts = time_dict["trajopt_attempts"]
         best_result.attempts = n + 1
         torch.cuda.synchronize()
+        if plan_config.pose_cost_metric is not None:
+            self.update_pose_cost_metric(PoseCostMetric.reset_metric())
         best_result.total_time = time.time() - start_time
         return best_result
@@ -1236,6 +1329,7 @@
         start_state: JointState,
         goal_pose: Pose,
         plan_config: MotionGenPlanConfig = MotionGenPlanConfig(),
+        link_poses: Optional[List[Pose]] = None,
     ) -> MotionGenResult:
         solve_state = self._get_solve_state(
             ReacherSolveType.GOALSET, plan_config, goal_pose, start_state
@@ -1246,6 +1340,7 @@
             start_state,
             goal_pose,
             plan_config,
+            link_poses=link_poses,
         )
         return result
@@ -1393,8 +1488,6 @@
             result.used_graph = True
             if plan_config.enable_opt:
-                # print(result.graph_plan.position.shape, interpolation_steps,
-                #       graph_result.path_buffer_last_tstep)
                 trajopt_seed = (
                     result.graph_plan.position.view(
                         1,  # solve_state.batch_size,
@@ -1451,16 +1544,19 @@
         if plan_config.need_graph_success:
             return result

         # do trajopt:
         if plan_config.enable_opt:
             with profiler.record_function("motion_gen/setup_trajopt_seeds"):
                 self._trajopt_goal_config[:, :ik_success] = goal_config
-
+            retract_config = None
+            if plan_config.use_start_state_as_retract:
+                retract_config = start_state.position.clone()
             goal = Goal(
                 goal_pose=goal_pose,
                 current_state=start_state,
                 links_goal_pose=link_poses,
+                retract_state=retract_config,
             )

             if (
@@ -1498,9 +1594,9 @@
                         device=self.tensor_args.device,
                         dtype=torch.bool,
                     )
-                    trajopt_seed_success_new[
-                        0, : trajopt_seed_success.shape[1]
-                    ] = trajopt_seed_success
+                    trajopt_seed_success_new[0, : trajopt_seed_success.shape[1]] = (
+                        trajopt_seed_success
+                    )
                     trajopt_seed_success = trajopt_seed_success_new
                 # create seeds here:
                 trajopt_seed_traj = self.trajopt_solver.get_seed_set(
@@ -1527,11 +1623,11 @@
                 trajopt_seed_traj,
                 num_seeds_override=solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds,
newton_iters=trajopt_newton_iters, - return_all_solutions=plan_config.parallel_finetune, + return_all_solutions=plan_config.parallel_finetune + and plan_config.enable_finetune_trajopt, ) if plan_config.enable_finetune_trajopt: self.trajopt_solver.interpolation_type = og_value - # self.trajopt_solver.compute_metrics(not og_evaluate, og_evaluate) if self.store_debug_in_result: result.debug_info["trajopt_result"] = traj_result # run finetune @@ -1578,15 +1674,13 @@ class MotionGen(MotionGenConfig): result.interpolated_plan = traj_result.interpolated_solution.trim_trajectory( 0, traj_result.path_buffer_last_tstep[0] ) - # print(ik_result.position_error[ik_result.success] * 1000.0) - # print(traj_result.position_error * 1000.0) - # exit() result.interpolation_dt = self.trajopt_solver.interpolation_dt result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep result.position_error = traj_result.position_error result.rotation_error = traj_result.rotation_error result.optimized_dt = traj_result.optimized_dt result.optimized_plan = traj_result.solution + result.goalset_index = traj_result.goalset_index return result def _plan_js_from_solve_state( @@ -1705,9 +1799,9 @@ class MotionGen(MotionGenConfig): device=self.tensor_args.device, dtype=torch.bool, ) - trajopt_seed_success_new[ - 0, : trajopt_seed_success.shape[1] - ] = trajopt_seed_success + trajopt_seed_success_new[0, : trajopt_seed_success.shape[1]] = ( + trajopt_seed_success + ) trajopt_seed_success = trajopt_seed_success_new # create seeds here: trajopt_seed_traj = self.js_trajopt_solver.get_seed_set( @@ -1784,14 +1878,13 @@ class MotionGen(MotionGenConfig): result.interpolated_plan = traj_result.interpolated_solution.trim_trajectory( 0, traj_result.path_buffer_last_tstep[0] ) - # print(ik_result.position_error[ik_result.success] * 1000.0) - # print(traj_result.position_error * 1000.0) - # exit() + result.interpolation_dt = self.trajopt_solver.interpolation_dt result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep result.cspace_error = traj_result.cspace_error result.optimized_dt = traj_result.optimized_dt result.optimized_plan = traj_result.solution + result.goalset_index = traj_result.goalset_index return result @@ -1809,7 +1902,6 @@ class MotionGen(MotionGenConfig): graph_success = 0 # plan ik: - ik_result = self._solve_ik_from_solve_state( goal_pose, solve_state, @@ -1829,6 +1921,7 @@ class MotionGen(MotionGenConfig): ik_time=ik_result.solve_time, solve_time=ik_result.solve_time, debug_info={}, + # goalset_index=ik_result.goalset_index, ) ik_success = torch.count_nonzero(ik_result.success) @@ -1883,7 +1976,7 @@ class MotionGen(MotionGenConfig): dtype=self.tensor_args.dtype, ) trajopt_seed_traj[0, :, :interpolation_steps, :] = trajopt_seed - + trajopt_seed_traj = trajopt_seed_traj.transpose(0, 1).contiguous() trajopt_seed_success = ik_result.success.clone() trajopt_seed_success[ik_result.success] = graph_result.success @@ -1910,9 +2003,9 @@ class MotionGen(MotionGenConfig): result.interpolated_plan = self._batch_graph_search_buffer g_dim = g_dim.cpu().squeeze().tolist() for x, x_val in enumerate(g_dim): - self._batch_path_buffer_last_tstep[ - x_val - ] = graph_result.path_buffer_last_tstep[x] + self._batch_path_buffer_last_tstep[x_val] = ( + graph_result.path_buffer_last_tstep[x] + ) result.path_buffer_last_tstep = self._batch_path_buffer_last_tstep result.optimized_plan = result.interpolated_plan result.optimized_dt = torch.as_tensor( @@ -1948,16 +2041,16 @@ class MotionGen(MotionGenConfig): current_state=start_state, 
) # generate seeds: - if trajopt_seed_traj is None or graph_success < solve_state.num_trajopt_seeds: + if trajopt_seed_traj is None or ( + plan_config.enable_graph and graph_success < solve_state.batch_size + ): seed_goal = Goal( goal_pose=goal_pose.repeat_seeds(solve_state.num_trajopt_seeds), current_state=start_state.repeat_seeds(solve_state.num_trajopt_seeds), goal_state=JointState.from_position(goal_config.view(-1, self._dof)), ) if trajopt_seed_traj is not None: - trajopt_seed_traj = ( - trajopt_seed_traj.transpose(0, 1).contiguous(), - ) # batch, num_seeds, h, dof + trajopt_seed_traj = trajopt_seed_traj.transpose(0, 1).contiguous() # create seeds here: trajopt_seed_traj = self.trajopt_solver.get_seed_set( @@ -1976,6 +2069,7 @@ class MotionGen(MotionGenConfig): if plan_config.enable_finetune_trajopt: og_value = self.trajopt_solver.interpolation_type self.trajopt_solver.interpolation_type = InterpolateType.LINEAR_CUDA + traj_result = self._solve_trajopt_from_solve_state( goal, solve_state, @@ -2005,6 +2099,7 @@ class MotionGen(MotionGenConfig): self.trajopt_solver.interpolation_dt, ) self.finetune_trajopt_solver.update_solver_dt(scaled_dt.item()) + traj_result = self._solve_trajopt_from_solve_state( goal, solve_state, @@ -2026,6 +2121,7 @@ class MotionGen(MotionGenConfig): result.position_error = traj_result.position_error result.rotation_error = traj_result.rotation_error result.cspace_error = traj_result.cspace_error + result.goalset_index = traj_result.goalset_index result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep result.optimized_plan = traj_result.solution result.optimized_dt = traj_result.optimized_dt @@ -2034,7 +2130,6 @@ class MotionGen(MotionGenConfig): result.success[:] = False if self.store_debug_in_result: result.debug_info = {"trajopt_result": traj_result} - return result def compute_kinematics(self, state: JointState) -> KinematicModelState: @@ -2054,7 +2149,7 @@ class MotionGen(MotionGenConfig): return metrics def update_world(self, world: WorldConfig): - self.world_coll_checker.load_collision_model(world) + self.world_coll_checker.load_collision_model(world, fix_cache_reference=self.use_cuda_graph) self.graph_planner.reset_graph() return True @@ -2081,9 +2176,32 @@ class MotionGen(MotionGenConfig): batch: Optional[int] = None, warmup_js_trajopt: bool = True, batch_env_mode: bool = False, - parallel_finetune: bool = False, + parallel_finetune: bool = True, + n_goalset: int = -1, + warmup_joint_index: int = 0, + warmup_joint_delta: float = 0.1, ): log_info("Warmup") + if warmup_js_trajopt: + start_state = JointState.from_position( + self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), + joint_names=self.rollout_fn.joint_names, + ) + # warm up js_trajopt: + goal_state = start_state.clone() + goal_state.position[..., warmup_joint_index] += warmup_joint_delta + for _ in range(2): + self.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1)) + if enable_graph: + start_state = JointState.from_position( + self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), + joint_names=self.rollout_fn.joint_names, + ) + start_state.position[..., warmup_joint_index] += warmup_joint_delta + self.graph_planner.warmup( + self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), + start_state.position, + ) if batch is None: start_state = JointState.from_position( @@ -2092,25 +2210,21 @@ class MotionGen(MotionGenConfig): ) state = self.rollout_fn.compute_kinematics(start_state) link_poses = state.link_pose - retract_pose = 
Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) - start_state.position += 0.1 - for _ in range(2): - self.plan_single( - start_state, - retract_pose, - MotionGenPlanConfig( - max_attempts=1, - enable_finetune_trajopt=True, - parallel_finetune=parallel_finetune, - ), - link_poses=link_poses, - ) - if enable_graph: - self.graph_planner.warmup( - self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), - start_state.position, - ) + if n_goalset == -1: + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + start_state.position[..., warmup_joint_index] += warmup_joint_delta + for _ in range(2): + self.plan_single( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, + enable_finetune_trajopt=True, + parallel_finetune=parallel_finetune, + ), + link_poses=link_poses, + ) self.plan_single( start_state, @@ -2123,41 +2237,100 @@ class MotionGen(MotionGenConfig): ), link_poses=link_poses, ) - if warmup_js_trajopt: - # warm up js_trajopt: - goal_state = start_state.clone() - goal_state.position += 0.2 + else: + retract_pose = Pose( + state.ee_pos_seq.repeat(n_goalset, 1).view(1, n_goalset, 3), + quaternion=state.ee_quat_seq.repeat(n_goalset, 1).view(1, n_goalset, 4), + ) + start_state.position[..., warmup_joint_index] += warmup_joint_delta for _ in range(2): - self.plan_single_js( - start_state, goal_state, MotionGenPlanConfig(max_attempts=1) + self.plan_goalset( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, + enable_finetune_trajopt=True, + parallel_finetune=parallel_finetune, + ), + link_poses=link_poses, ) + self.plan_goalset( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, + enable_finetune_trajopt=True, + enable_graph=enable_graph, + parallel_finetune=parallel_finetune, + ), + link_poses=link_poses, + ) + else: start_state = JointState.from_position( self.get_retract_config().view(1, -1).clone(), joint_names=self.rollout_fn.joint_names, ).repeat_seeds(batch) state = self.rollout_fn.compute_kinematics(start_state) - retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) - start_state.position += 0.1 - for _ in range(3): - if batch_env_mode: - self.plan_batch_env( - start_state, - retract_pose, - MotionGenPlanConfig( - max_attempts=1, enable_finetune_trajopt=True, enable_graph=enable_graph - ), - ) - else: - self.plan_batch( - start_state, - retract_pose, - MotionGenPlanConfig( - max_attempts=1, enable_finetune_trajopt=True, enable_graph=enable_graph - ), - ) + if n_goalset == -1: + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + start_state.position[..., warmup_joint_index] += warmup_joint_delta + + for _ in range(2): + if batch_env_mode: + self.plan_batch_env( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=10, + enable_finetune_trajopt=True, + enable_graph=enable_graph, + ), + ) + else: + self.plan_batch( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=10, + enable_finetune_trajopt=True, + enable_graph=enable_graph, + ), + ) + else: + retract_pose = Pose( + state.ee_pos_seq.view(batch, 1, 3).repeat(1, n_goalset, 1).contiguous(), + quaternion=state.ee_quat_seq.view(batch, 1, 4) + .repeat(1, n_goalset, 1) + .contiguous(), + ) + start_state.position[..., warmup_joint_index] += warmup_joint_delta + for _ in range(2): + if batch_env_mode: + self.plan_batch_env_goalset( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=10, + enable_finetune_trajopt=True, + enable_graph=enable_graph, + 
enable_graph_attempt=20,
+                            ),
+                        )
+                else:
+                    self.plan_batch_goalset(
+                        start_state,
+                        retract_pose,
+                        MotionGenPlanConfig(
+                            max_attempts=10,
+                            enable_finetune_trajopt=True,
+                            enable_graph=enable_graph,
+                            enable_graph_attempt=20,
+                        ),
+                    )
+
+        log_info("Warmup complete")
         return True
@@ -2200,6 +2373,7 @@
                 break

         result.solve_time = time_dict["trajopt_time"]
+        result.total_time = result.solve_time
         if self.store_debug_in_result:
             result.debug_info = {"trajopt_result": traj_result}
@@ -2302,15 +2476,92 @@
         opt_js = in_js.get_ordered_joint_state(opt_jnames)
         return opt_js

+    def update_pose_cost_metric(
+        self,
+        metric: PoseCostMetric,
+        start_state: Optional[JointState] = None,
+        goal_pose: Optional[Pose] = None,
+    ) -> bool:
+        # check if constraint is valid:
+        if metric.hold_partial_pose and metric.offset_tstep_fraction < 0.0:
+            start_pose = self.compute_kinematics(start_state).ee_pose.clone()
+            if self.project_pose_to_goal_frame:
+                # project start pose to goal frame:
+                projected_pose = goal_pose.compute_local_pose(start_pose)
+                if torch.count_nonzero(metric.hold_vec_weight[:3] > 0.0) > 0:
+                    # angular distance should be zero:
+                    distance = projected_pose.angular_distance(
+                        Pose.from_list([0, 0, 0, 1, 0, 0, 0], tensor_args=self.tensor_args)
+                    )
+                    if torch.max(distance) > 0.05:
+                        log_warn(
+                            "Partial orientation between start and goal is not equal: "
+                            + str(distance)
+                        )
+                        return False
+
+                # check linear distance:
+                if (
+                    torch.count_nonzero(
+                        torch.abs(projected_pose.position[..., metric.hold_vec_weight[3:] > 0.0])
+                        > 0.005
+                    )
+                    > 0
+                ):
+                    log_warn("Partial position between start and goal is not equal.")
+                    return False
+            else:
+                # compute the position difference between goal and start:
+                projected_position = goal_pose.position - start_pose.position
+                # check linear distance:
+                if (
+                    torch.count_nonzero(
+                        torch.abs(projected_position[..., metric.hold_vec_weight[3:] > 0.0]) > 0.005
+                    )
+                    > 0
+                ):
+                    log_warn("Partial position between start and goal is not equal.")
+                    return False
+
+        rollouts = self.get_all_pose_solver_rollout_instances()
+        [
+            rollout.update_pose_cost_metric(metric)
+            for rollout in rollouts
+            if isinstance(rollout, ArmReacher)
+        ]
+        torch.cuda.synchronize()
+        return True
+
     def get_all_rollout_instances(self) -> List[RolloutBase]:
         if self._rollout_list is None:
             self._rollout_list = (
                 self.ik_solver.get_all_rollout_instances()
                 + self.graph_planner.get_all_rollout_instances()
                 + self.trajopt_solver.get_all_rollout_instances()
+                + self.finetune_trajopt_solver.get_all_rollout_instances()
+                + self.js_trajopt_solver.get_all_rollout_instances()
             )
         return self._rollout_list

+    def get_all_solver_rollout_instances(self) -> List[RolloutBase]:
+        if self._solver_rollout_list is None:
+            self._solver_rollout_list = (
+                self.ik_solver.solver.get_all_rollout_instances()
+                + self.trajopt_solver.solver.get_all_rollout_instances()
+                + self.finetune_trajopt_solver.solver.get_all_rollout_instances()
+                + self.js_trajopt_solver.solver.get_all_rollout_instances()
+            )
+        return self._solver_rollout_list
+
+    def get_all_pose_solver_rollout_instances(self) -> List[RolloutBase]:
+        if self._pose_solver_rollout_list is None:
+            self._pose_solver_rollout_list = (
+                self.ik_solver.solver.get_all_rollout_instances()
+                + self.trajopt_solver.solver.get_all_rollout_instances()
+                + self.finetune_trajopt_solver.solver.get_all_rollout_instances()
+            )
+        return self._pose_solver_rollout_list
+
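+    # Usage sketch for the constrained-planning metric (illustrative only, not
+    # part of this commit; assumes `motion_gen` is a warmed-up MotionGen and
+    # that holding orientation about x and y is the desired constraint):
+    #
+    #   metric = PoseCostMetric(
+    #       hold_partial_pose=True,
+    #       hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 0, 0, 0, 0]),
+    #   )
+    #   result = motion_gen.plan_single(
+    #       start_state,
+    #       goal_pose,
+    #       MotionGenPlanConfig(pose_cost_metric=metric),
+    #   )
+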
    def get_all_kinematics_instances(self) -> List[CudaRobotModel]:
        if self._kin_list is None:
            self._kin_list = [
@@ -2385,6 +2636,65 @@
         self.attach_spheres_to_robot(sphere_tensor=sphere_tensor, link_name=link_name)

+    def attach_external_objects_to_robot(
+        self,
+        joint_state: JointState,
+        external_objects: List[Obstacle],
+        surface_sphere_radius: float = 0.001,
+        link_name: str = "attached_object",
+        sphere_fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        world_objects_pose_offset: Optional[Pose] = None,
+    ) -> None:
+        """Attach external objects (obstacles) to a robot's link.
+
+        Args:
+            joint_state: Current joint state of the robot, used to compute the pose of link_name.
+            external_objects: List of obstacles to attach to the robot's link.
+            surface_sphere_radius: Radius of spheres used to approximate object surfaces.
+                Defaults to 0.001.
+            link_name: Robot link to attach the objects to. Defaults to "attached_object".
+            sphere_fit_type: Method used to fit spheres to the objects. Defaults to
+                SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE.
+            voxelize_method: Voxelization method used when fitting spheres. Defaults to "ray".
+            world_objects_pose_offset: Offset pose applied to objects before attaching.
+                Defaults to None.
+        """
+        log_info("MG: Attach objects to robot")
+        if len(external_objects) == 0:
+            log_error("no object in external_objects")
+        kin_state = self.compute_kinematics(joint_state)
+        ee_pose = kin_state.ee_pose  # w_T_ee
+        if world_objects_pose_offset is not None:
+            # add offset from ee:
+            ee_pose = world_objects_pose_offset.inverse().multiply(ee_pose)
+            # new ee_pose:
+            # w_T_ee = offset_T_w * w_T_ee
+            # ee_T_w
+        ee_pose = ee_pose.inverse()  # ee_T_w to multiply all objects later
+        max_spheres = self.robot_cfg.kinematics.kinematics_config.get_number_of_spheres(link_name)
+        object_names = [x.name for x in external_objects]
+        n_spheres = int(max_spheres / len(object_names))
+        sphere_tensor = torch.zeros((max_spheres, 4))
+        sphere_tensor[:, 3] = -10.0
+        sph_list = []
+        if n_spheres == 0:
+            log_error("MG: No spheres found")
+        for i, x in enumerate(object_names):
+            obs = external_objects[i]
+            sph = obs.get_bounding_spheres(
+                n_spheres,
+                surface_sphere_radius,
+                pre_transform_pose=ee_pose,
+                tensor_args=self.tensor_args,
+                fit_type=sphere_fit_type,
+                voxelize_method=voxelize_method,
+            )
+            sph_list += [s.position + [s.radius] for s in sph]
+
+        log_info("MG: Computed spheres for attach objects to robot")
+
+        spheres = self.tensor_args.to_device(torch.as_tensor(sph_list))
+
+        if spheres.shape[0] > max_spheres:
+            spheres = spheres[:max_spheres]
+        sphere_tensor[: spheres.shape[0], :] = spheres.contiguous()
+
+        self.attach_spheres_to_robot(sphere_tensor=sphere_tensor, link_name=link_name)
+
     def add_camera_frame(self, camera_observation: CameraObservation, obstacle_name: str):
         self.world_coll_checker.add_camera_frame(camera_observation, obstacle_name)
@@ -2616,6 +2926,10 @@
     def world_collision(self) -> WorldCollision:
         return self.world_coll_checker

+    @property
+    def project_pose_to_goal_frame(self) -> bool:
+        return self.trajopt_solver.rollout_fn.goal_cost.project_distance
+
     def update_interpolation_type(
         self,
         interpolation_type: InterpolateType,
@@ -2629,7 +2943,22 @@
         self.finetune_trajopt_solver.interpolation_type = interpolation_type
         self.js_trajopt_solver.interpolation_type = interpolation_type

-    def update_locked_state(js: JointState):
-        # update the joint values of locked joints:
+    def update_locked_joints(
+        self, lock_joints: Dict[str, float], robot_config_dict: Union[str, Dict[str, Any]]
+    ):
+        if isinstance(robot_config_dict, str):
+            robot_config_dict = load_yaml(join_path(get_robot_configs_path(), robot_config_dict))[
+                "robot_cfg"
+            ]
+        if "robot_cfg" in robot_config_dict:
+            robot_config_dict = robot_config_dict["robot_cfg"]
+        robot_config_dict["kinematics"]["lock_joints"] = lock_joints
+        robot_cfg = RobotConfig.from_dict(robot_config_dict, self.tensor_args)
-        pass
+        # make sure the new robot config and the current one have the same joint limits:
+        new_joint_limits = robot_cfg.kinematics.get_joint_limits()
+        current_joint_limits = self.robot_cfg.kinematics.get_joint_limits()
+        # if new_joint_limits != current_joint_limits:
+        #     log_error("Joint limits are different, re-instantiate MotionGen")
+
+        self.kinematics.update_kinematics_config(robot_cfg.kinematics.kinematics_config)
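+        # Usage sketch (illustrative only, not part of this commit; the joint
+        # name and value below are hypothetical -- use names from your robot
+        # configuration file):
+        #
+        #   motion_gen.update_locked_joints({"joint_name": 0.0}, "robot.yml")
+        #
+        # This rebuilds the kinematics configuration with the new locked joint
+        # values without re-creating the MotionGen instance.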
diff --git a/src/curobo/wrap/reacher/mpc.py b/src/curobo/wrap/reacher/mpc.py
index 1a767c9..378756d 100644
--- a/src/curobo/wrap/reacher/mpc.py
+++ b/src/curobo/wrap/reacher/mpc.py
@@ -80,7 +80,7 @@ class MpcSolverConfig:
             task_file = "particle_mpc.yml"
         config_data = load_yaml(join_path(get_task_configs_path(), task_file))
-        config_data["mppi"]["n_envs"] = 1
+        config_data["mppi"]["n_problems"] = 1
         if step_dt is not None:
             config_data["model"]["dt_traj_params"]["base_dt"] = step_dt
         if particle_opt_iters is not None:
@@ -238,7 +238,7 @@ class MpcSolver(MpcSolverConfig):
                 self.tensor_args,
             )
             if update_reference:
-                self.solver.update_nenvs(self._solve_state.get_batch_size())
+                self.solver.update_nproblems(self._solve_state.get_batch_size())
                 self.reset()
                 self.reset_cuda_graph()
             self._col = torch.arange(
diff --git a/src/curobo/wrap/reacher/trajopt.py b/src/curobo/wrap/reacher/trajopt.py
index c4b714e..5e3bc78 100644
--- a/src/curobo/wrap/reacher/trajopt.py
+++ b/src/curobo/wrap/reacher/trajopt.py
@@ -30,6 +30,7 @@
 from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig
 from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig
 from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig
 from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig
+from curobo.rollout.cost.pose_cost import PoseCostMetric
 from curobo.rollout.dynamics_model.integration_utils import (
     action_interpolate_kernel,
     interpolate_kernel,
@@ -39,7 +40,7 @@
 from curobo.types.base import TensorDeviceType
 from curobo.types.robot import JointState, RobotConfig
 from curobo.types.tensor import T_BDOF, T_DOF, T_BValue_bool, T_BValue_float
 from curobo.util.helpers import list_idx_if_not_none
-from curobo.util.logger import log_info, log_warn
+from curobo.util.logger import log_error, log_info, log_warn
 from curobo.util.trajectory import (
     InterpolateType,
     calculate_dt_no_clamp,
@@ -78,6 +79,7 @@ class TrajOptSolverConfig:
     trim_steps: Optional[List[int]] = None
     store_debug_in_result: bool = False
     optimize_dt: bool = True
+    use_cuda_graph: bool = True

     @staticmethod
     @profiler.record_function("trajopt_config/load_from_robot_config")
@@ -98,14 +100,14 @@
         interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA,
         interpolation_steps: int = 10000,
         interpolation_dt: float = 0.01,
-        use_cuda_graph: Optional[bool] = None,
+        use_cuda_graph: bool = True,
         self_collision_check: bool = False,
         self_collision_opt: bool = True,
         grad_trajopt_iters: Optional[int] = None,
         num_seeds: int = 2,
         seed_ratio: Dict[str, int] = {"linear": 1.0, "bias": 0.0, "start": 0.0, "end": 0.0},
         use_particle_opt: bool = True,
-        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE,
+        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.MESH,
         traj_evaluator_config: TrajEvaluatorConfig = TrajEvaluatorConfig(),
         traj_evaluator: Optional[TrajEvaluator] = None,
         minimize_jerk: bool = True,
@@ -128,6 +130,7 @@
         state_finite_difference_mode: Optional[str] = None,
         filter_robot_command: bool = False,
         optimize_dt: bool = True,
+        project_pose_to_goal_frame: bool = True,
     ):
         # NOTE: Don't give optimize_dt a default here; read it from a configuration file instead.
         # use default values, disable environment collision checking
@@ -163,6 +166,11 @@
         if traj_tsteps is None:
             traj_tsteps = grad_config_data["model"]["horizon"]

+        base_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        base_config_data["convergence"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        grad_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+
         config_data["model"]["horizon"] = traj_tsteps
         grad_config_data["model"]["horizon"] = traj_tsteps
         if minimize_jerk is not None:
@@ -212,8 +225,8 @@
         if not self_collision_opt:
             config_data["cost"]["self_collision_cfg"]["weight"] = 0.0
             grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0
-        config_data["mppi"]["n_envs"] = 1
-        grad_config_data["lbfgs"]["n_envs"] = 1
+        config_data["mppi"]["n_problems"] = 1
+        grad_config_data["lbfgs"]["n_problems"] = 1
         if fixed_iters is not None:
             grad_config_data["lbfgs"]["fixed_iters"] = fixed_iters
@@ -256,9 +269,10 @@
             world_coll_checker=world_coll_checker,
             tensor_args=tensor_args,
         )
-
+        arm_rollout_mppi = None
         with profiler.record_function("trajopt_config/create_rollouts"):
-            arm_rollout_mppi = ArmReacher(cfg)
+            if use_particle_opt:
+                arm_rollout_mppi = ArmReacher(cfg)
             arm_rollout_grad = ArmReacher(grad_cfg)
             arm_rollout_safety = ArmReacher(safety_cfg)
@@ -266,20 +280,22 @@
             aux_rollout = ArmReacher(safety_cfg)
             interpolate_rollout = ArmReacher(safety_cfg)
         if trajopt_dt is not None:
-            arm_rollout_mppi.update_traj_dt(dt=trajopt_dt)
+            if arm_rollout_mppi is not None:
+                arm_rollout_mppi.update_traj_dt(dt=trajopt_dt)
             aux_rollout.update_traj_dt(dt=trajopt_dt)
             arm_rollout_grad.update_traj_dt(dt=trajopt_dt)
             arm_rollout_safety.update_traj_dt(dt=trajopt_dt)
-
-        config_dict = ParallelMPPIConfig.create_data_dict(
-            config_data["mppi"], arm_rollout_mppi, tensor_args
-        )
+        if arm_rollout_mppi is not None:
+            config_dict = ParallelMPPIConfig.create_data_dict(
+                config_data["mppi"], arm_rollout_mppi, tensor_args
+            )
+        parallel_mppi = None
         if use_es is not None and use_es:
             mppi_cfg = ParallelESConfig(**config_dict)
             if es_learning_rate is not None:
                 mppi_cfg.learning_rate = es_learning_rate
             parallel_mppi = ParallelES(mppi_cfg)
-        else:
+        elif use_particle_opt:
             mppi_cfg = ParallelMPPIConfig(**config_dict)
             parallel_mppi = ParallelMPPI(mppi_cfg)
@@ -307,7 +323,7 @@
         cfg = WrapConfig(
             safety_rollout=arm_rollout_safety,
             optimizers=opt_list,
-            compute_metrics=not evaluate_interpolated_trajectory,
+            compute_metrics=True,  # not evaluate_interpolated_trajectory,
             use_cuda_graph_metrics=grad_config_data["lbfgs"]["use_cuda_graph"],
             sync_cuda_time=sync_cuda_time,
         )
@@ -337,6 +353,7 @@
             trim_steps=trim_steps,
             store_debug_in_result=store_debug_in_result,
             optimize_dt=optimize_dt,
+            use_cuda_graph=use_cuda_graph,
         )
         return trajopt_cfg
@@ -360,6 +377,7 @@ class
TrajResult(Sequence): optimized_dt: Optional[torch.Tensor] = None raw_solution: Optional[JointState] = None raw_action: Optional[torch.Tensor] = None + goalset_index: Optional[torch.Tensor] = None def __getitem__(self, idx): # position_error = rotation_error = cspace_error = path_buffer_last_tstep = None @@ -372,6 +390,7 @@ class TrajResult(Sequence): self.position_error, self.rotation_error, self.cspace_error, + self.goalset_index, ] idx_vals = list_idx_if_not_none(d_list, idx) @@ -388,6 +407,7 @@ class TrajResult(Sequence): position_error=idx_vals[3], rotation_error=idx_vals[4], cspace_error=idx_vals[5], + goalset_index=idx_vals[6], ) def __len__(self): @@ -405,7 +425,7 @@ class TrajOptSolver(TrajOptSolverConfig): 3, int(self.action_horizon / 2), self.tensor_args ).unsqueeze(0) assert self.action_horizon / 2 != 0.0 - self.solver.update_nenvs(self.num_seeds) + self.solver.update_nproblems(self.num_seeds) self._max_joint_vel = ( self.solver.safety_rollout.state_bounds.velocity.view(2, self.dof)[1, :].reshape( 1, 1, self.dof @@ -469,12 +489,18 @@ class TrajOptSolver(TrajOptSolverConfig): self._goal_buffer, self.tensor_args, ) + if update_reference: - self.solver.update_nenvs(self._solve_state.get_batch_size()) - self.reset_cuda_graph() + if self.use_cuda_graph and self._col is not None: + log_error("changing goal type, breaking previous cuda graph.") + self.reset_cuda_graph() + + self.solver.update_nproblems(self._solve_state.get_batch_size()) self._col = torch.arange( 0, goal.batch, device=self.tensor_args.device, dtype=torch.long ) + self.reset_shape() + return self._goal_buffer def solve_any( @@ -586,6 +612,8 @@ class TrajOptSolver(TrajOptSolverConfig): num_seeds, solve_state.batch_mode, ) + if traj_result.goalset_index is not None: + traj_result.goalset_index[traj_result.goalset_index >= goal.goal_pose.n_goalset] = 0 if newton_iters is not None: self.solver.newton_optimizer.outer_iters = self._og_newton_iters self.solver.newton_optimizer.fixed_iters = self._og_newton_fixed_iters @@ -839,16 +867,18 @@ class TrajOptSolver(TrajOptSolverConfig): if self.evaluate_interpolated_trajectory: with profiler.record_function("trajopt/evaluate_interpolated"): if self.use_cuda_graph_metrics: - result.metrics = self.interpolate_rollout.get_metrics_cuda_graph( - interpolated_trajs - ) + metrics = self.interpolate_rollout.get_metrics_cuda_graph(interpolated_trajs) else: - result.metrics = self.interpolate_rollout.get_metrics(interpolated_trajs) + metrics = self.interpolate_rollout.get_metrics(interpolated_trajs) + result.metrics.feasible = metrics.feasible + result.metrics.position_error = metrics.position_error + result.metrics.rotation_error = metrics.rotation_error + result.metrics.cspace_error = metrics.cspace_error + result.metrics.goalset_index = metrics.goalset_index st_time = time.time() feasible = torch.all(result.metrics.feasible, dim=-1) - # if self.num_seeds == 1: - # print(result.metrics) + if result.metrics.position_error is not None: converge = torch.logical_and( result.metrics.position_error[..., -1] <= self.position_threshold, @@ -877,10 +907,10 @@ class TrajOptSolver(TrajOptSolverConfig): optimized_dt=opt_dt, raw_solution=result.action, raw_action=result.raw_action, + goalset_index=result.metrics.goalset_index, ) else: # get path length: - # max_vel = if self.evaluate_interpolated_trajectory: smooth_label, smooth_cost = self.traj_evaluator.evaluate_interpolated_smootheness( interpolated_trajs, @@ -896,7 +926,6 @@ class TrajOptSolver(TrajOptSolverConfig): 
self.solver.rollout_fn.dynamics_model.cspace_distance_weight, self._velocity_bounds, ) - # print(smooth_label, success, self._velocity_bounds.shape, self.solver.rollout_fn.dynamics_model.cspace_distance_weight) with profiler.record_function("trajopt/best_select"): success[~smooth_label] = False @@ -907,7 +936,8 @@ class TrajOptSolver(TrajOptSolverConfig): convergence_error = result.metrics.cspace_error[..., -1] else: raise ValueError("convergence check requires either goal_pose or goal_state") - error = convergence_error + smooth_cost + running_cost = torch.mean(result.metrics.cost, dim=-1) * 0.0001 + error = convergence_error + smooth_cost + running_cost error[~success] += 10000.0 if batch_mode: idx = torch.argmin(error.view(goal.batch, num_seeds), dim=-1) @@ -923,13 +953,16 @@ class TrajOptSolver(TrajOptSolverConfig): best_act_seq = result.action[idx] best_raw_action = result.raw_action[idx] interpolated_traj = interpolated_trajs[idx] - position_error = rotation_error = cspace_error = None + goalset_index = position_error = rotation_error = cspace_error = None if result.metrics.position_error is not None: position_error = result.metrics.position_error[idx, -1] if result.metrics.rotation_error is not None: rotation_error = result.metrics.rotation_error[idx, -1] if result.metrics.cspace_error is not None: cspace_error = result.metrics.cspace_error[idx, -1] + if result.metrics.goalset_index is not None: + goalset_index = result.metrics.goalset_index[idx, -1] + opt_dt = opt_dt[idx] if self.sync_cuda_time: torch.cuda.synchronize() @@ -965,6 +998,7 @@ class TrajOptSolver(TrajOptSolverConfig): optimized_dt=opt_dt, raw_solution=best_act_seq, raw_action=best_raw_action, + goalset_index=goalset_index, ) return traj_result @@ -999,7 +1033,6 @@ class TrajOptSolver(TrajOptSolverConfig): return self.solve_batch_goalset( goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success ) - return traj_result def get_linear_seed(self, start_state, goal_state): start_q = start_state.position.view(-1, 1, self.dof) @@ -1173,6 +1206,11 @@ class TrajOptSolver(TrajOptSolverConfig): self.interpolate_rollout.reset_cuda_graph() self.rollout_fn.reset_cuda_graph() + def reset_shape(self): + self.solver.reset_shape() + self.interpolate_rollout.reset_shape() + self.rollout_fn.reset_shape() + @property def kinematics(self) -> CudaRobotModel: return self.rollout_fn.dynamics_model.robot_model @@ -1205,3 +1243,14 @@ class TrajOptSolver(TrajOptSolverConfig): def get_full_js(self, active_js: JointState) -> JointState: return self.rollout_fn.get_full_dof_from_solution(active_js) + + def update_pose_cost_metric( + self, + metric: PoseCostMetric, + ): + rollouts = self.get_all_rollout_instances() + [ + rollout.update_pose_cost_metric(metric) + for rollout in rollouts + if isinstance(rollout, ArmReacher) + ] diff --git a/src/curobo/wrap/reacher/types.py b/src/curobo/wrap/reacher/types.py index 15f5623..7e202a9 100644 --- a/src/curobo/wrap/reacher/types.py +++ b/src/curobo/wrap/reacher/types.py @@ -13,7 +13,7 @@ from __future__ import annotations # Standard Library from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union # CuRobo from curobo.rollout.rollout_base import Goal @@ -126,16 +126,26 @@ class ReacherSolveState: if ( current_solve_state is None or current_goal_buffer is None - or current_solve_state != solve_state or (current_goal_buffer.retract_state is None and retract_config is not None) or 
(current_goal_buffer.goal_state is None and goal_state is not None) or (current_goal_buffer.links_goal_pose is None and link_poses is not None) ): + update_reference = True + + elif current_solve_state != solve_state: + new_goal_pose = get_padded_goalset( + solve_state, current_solve_state, current_goal_buffer, goal_pose + ) + if new_goal_pose is not None: + goal_pose = new_goal_pose + else: + update_reference = True + + if update_reference: current_solve_state = solve_state current_goal_buffer = solve_state.create_goal_buffer( goal_pose, goal_state, retract_config, link_poses, tensor_args ) - update_reference = True else: current_goal_buffer.goal_pose.copy_(goal_pose) if retract_config is not None: @@ -145,6 +155,7 @@ class ReacherSolveState: if link_poses is not None: for k in link_poses.keys(): current_goal_buffer.links_goal_pose[k].copy_(link_poses[k]) + return current_solve_state, current_goal_buffer, update_reference def update_goal( @@ -155,17 +166,27 @@ class ReacherSolveState: tensor_args: TensorDeviceType = TensorDeviceType(), ): solve_state = self update_reference = False if ( current_solve_state is None or current_goal_buffer is None - or current_solve_state != solve_state or (current_goal_buffer.goal_state is None and goal.goal_state is not None) or (current_goal_buffer.goal_state is not None and goal.goal_state is None) ): - # TODO: Check for change in update idx buffers, currently we assume - # that solve_state captures difference in idx buffers + update_reference = True + elif current_solve_state != solve_state: + new_goal_pose = get_padded_goalset( + solve_state, current_solve_state, current_goal_buffer, goal.goal_pose + ) + if new_goal_pose is not None: + goal = goal.clone() + goal.goal_pose = new_goal_pose + + else: + update_reference = True + + if update_reference: current_solve_state = solve_state current_goal_buffer = goal.create_index_buffers( solve_state.batch_size, @@ -174,7 +195,6 @@ class ReacherSolveState: solve_state.num_seeds, tensor_args, ) - update_reference = True else: current_goal_buffer.copy_(goal, update_idx_buffers=False) return current_solve_state, current_goal_buffer, update_reference @@ -185,3 +205,92 @@ class MotionGenSolverState: solve_type: ReacherSolveType ik_solve_state: ReacherSolveState trajopt_solve_state: ReacherSolveState + + +def get_padded_goalset( + solve_state: ReacherSolveState, + current_solve_state: ReacherSolveState, + current_goal_buffer: Goal, + new_goal_pose: Pose, +) -> Union[Pose, None]: + if ( + current_solve_state.solve_type == ReacherSolveType.GOALSET + and solve_state.solve_type == ReacherSolveType.SINGLE + ): + # convert single goal to goal set + # solve_state.solve_type = ReacherSolveType.GOALSET + # solve_state.n_goalset = current_solve_state.n_goalset + + goal_pose = current_goal_buffer.goal_pose.clone() + goal_pose.position[:] = new_goal_pose.position + goal_pose.quaternion[:] = new_goal_pose.quaternion + return goal_pose + + elif ( + current_solve_state.solve_type == ReacherSolveType.BATCH_GOALSET + and solve_state.solve_type == ReacherSolveType.BATCH + and new_goal_pose.n_goalset <= current_solve_state.n_goalset + and new_goal_pose.batch == current_solve_state.batch_size + ): + goal_pose = current_goal_buffer.goal_pose.clone() + if len(new_goal_pose.position.shape) == 2: + new_goal_pose = new_goal_pose.unsqueeze(1) + goal_pose.position[..., :, :] = new_goal_pose.position + goal_pose.quaternion[..., :, :] = new_goal_pose.quaternion + return goal_pose + elif ( + current_solve_state.solve_type ==
ReacherSolveType.BATCH_ENV_GOALSET + and solve_state.solve_type == ReacherSolveType.BATCH_ENV + and new_goal_pose.n_goalset <= current_solve_state.n_goalset + and new_goal_pose.batch == current_solve_state.batch_size + ): + goal_pose = current_goal_buffer.goal_pose.clone() + if len(new_goal_pose.position.shape) == 2: + new_goal_pose = new_goal_pose.unsqueeze(1) + goal_pose.position[..., :, :] = new_goal_pose.position + goal_pose.quaternion[..., :, :] = new_goal_pose.quaternion + return goal_pose + + elif ( + current_solve_state.solve_type == ReacherSolveType.GOALSET + and solve_state.solve_type == ReacherSolveType.GOALSET + and new_goal_pose.n_goalset <= current_solve_state.n_goalset + ): + goal_pose = current_goal_buffer.goal_pose.clone() + goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position + goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion + goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :] + goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[ + ..., :1, : + ] + + return goal_pose + elif ( + current_solve_state.solve_type == ReacherSolveType.BATCH_GOALSET + and solve_state.solve_type == ReacherSolveType.BATCH_GOALSET + and new_goal_pose.n_goalset <= current_solve_state.n_goalset + and new_goal_pose.batch == current_solve_state.batch_size + ): + goal_pose = current_goal_buffer.goal_pose.clone() + goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position + goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion + goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :] + goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[ + ..., :1, : + ] + return goal_pose + elif ( + current_solve_state.solve_type == ReacherSolveType.BATCH_ENV_GOALSET + and solve_state.solve_type == ReacherSolveType.BATCH_ENV_GOALSET + and new_goal_pose.n_goalset <= current_solve_state.n_goalset + and new_goal_pose.batch == current_solve_state.batch_size + ): + goal_pose = current_goal_buffer.goal_pose.clone() + goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position + goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion + goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :] + goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[ + ..., :1, : + ] + return goal_pose + return None diff --git a/src/curobo/wrap/wrap_base.py b/src/curobo/wrap/wrap_base.py index bd02a02..447d7e6 100644 --- a/src/curobo/wrap/wrap_base.py +++ b/src/curobo/wrap/wrap_base.py @@ -54,7 +54,7 @@ class WrapBase(WrapConfig): def __init__(self, config: Optional[WrapConfig] = None): if config is not None: WrapConfig.__init__(self, **vars(config)) - self.n_envs = 1 + self.n_problems = 1 self.opt_dt = 0 self._rollout_list = None self._opt_rollouts = None @@ -83,11 +83,11 @@ class WrapBase(WrapConfig): debug_list.append(opt.debug_cost) return debug_list - def update_nenvs(self, n_envs): - if n_envs != self.n_envs: - self.n_envs = n_envs + def update_nproblems(self, n_problems): + if n_problems != self.n_problems: + self.n_problems = n_problems for opt in self.optimizers: - opt.update_nenvs(self.n_envs) + opt.update_nproblems(self.n_problems) def update_params(self, goal: Goal): with profiler.record_function("wrap_base/safety/update_params"): @@ -117,6 +117,12 @@ class WrapBase(WrapConfig): 
opt.reset_cuda_graph() self._init_solver = False + def reset_shape(self): + self.safety_rollout.reset_shape() + for opt in self.optimizers: + opt.reset_shape() + self._init_solver = False + @property def rollout_fn(self): return self.safety_rollout diff --git a/tests/goal_test.py b/tests/goal_test.py index 788e557..740b9c9 100644 --- a/tests/goal_test.py +++ b/tests/goal_test.py @@ -47,6 +47,9 @@ def test_repeat_seeds(): out_r_v = torch.zeros((b, 4), device=tensor_args.device, dtype=tensor_args.dtype) out_idx = torch.zeros((b, 1), device=tensor_args.device, dtype=torch.int32) vec_weight = torch.ones((6), device=tensor_args.device, dtype=tensor_args.dtype) + offset_waypoint = torch.zeros((6), device=tensor_args.device, dtype=tensor_args.dtype) + offset_tstep_fraction = torch.ones((1), device=tensor_args.device, dtype=tensor_args.dtype) + weight = tensor_args.to_device([1, 1, 1, 1]) vec_convergence = tensor_args.to_device([0, 0]) run_weight = tensor_args.to_device([1]) @@ -66,6 +69,8 @@ def test_repeat_seeds(): vec_convergence, run_weight, vec_weight.clone() * 0.0, + offset_waypoint, + offset_tstep_fraction, g.batch_pose_idx, start_pose.position.shape[0], 1, @@ -74,6 +79,7 @@ def test_repeat_seeds(): False, False, True, + True, ) assert torch.sum(r[0]).item() == 0.0 @@ -112,6 +118,9 @@ def test_horizon_repeat_seeds(): out_r_v = torch.zeros((b, h, 4), device=tensor_args.device, dtype=tensor_args.dtype) out_idx = torch.zeros((b, h, 1), device=tensor_args.device, dtype=torch.int32) vec_weight = torch.ones((6), device=tensor_args.device, dtype=tensor_args.dtype) + offset_waypoint = torch.zeros((6), device=tensor_args.device, dtype=tensor_args.dtype) + offset_tstep_fraction = torch.ones((1), device=tensor_args.device, dtype=tensor_args.dtype) + weight = tensor_args.to_device([1, 1, 1, 1]) vec_convergence = tensor_args.to_device([0, 0]) run_weight = torch.zeros((1, h), device=tensor_args.device) @@ -132,6 +141,8 @@ def test_horizon_repeat_seeds(): vec_convergence, run_weight, vec_weight.clone() * 0.0, + offset_waypoint, + offset_tstep_fraction, g.batch_pose_idx, start_pose.position.shape[0], h, @@ -140,5 +151,6 @@ def test_horizon_repeat_seeds(): True, False, False, + True, ) - assert torch.sum(r[0]).item() == 0.0 + assert torch.sum(r[0]).item() < 1e-6 diff --git a/tests/ik_config_test.py b/tests/ik_config_test.py index ba99bb8..2ae1ade 100644 --- a/tests/ik_config_test.py +++ b/tests/ik_config_test.py @@ -146,5 +146,5 @@ def test_eval(config, expected): result = ik_solver.solve_single(goal) success = result.success - if expected is not -100: + if expected != -100: assert success.item() == expected diff --git a/tests/kinematics_test.py b/tests/kinematics_test.py index dee3f40..14b7386 100644 --- a/tests/kinematics_test.py +++ b/tests/kinematics_test.py @@ -169,3 +169,35 @@ def test_locked_joints_kinematics(): assert torch.linalg.norm(out.ee_position - out_locked.ee_position) < 1e-5 assert torch.linalg.norm(out.ee_quaternion - out_locked.ee_quaternion) < 1e-5 assert torch.linalg.norm(out.link_spheres_tensor - out_locked.link_spheres_tensor) < 1e-5 + + +def test_franka_toggle_link_collision(cfg): + tensor_args = TensorDeviceType() + + robot_model = CudaRobotModel(cfg) + q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1) + sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("panda_link5") + state = robot_model.get_state(q_test.clone())
+ attached_spheres = state.link_spheres_tensor[:, sph_idx, :] + # all sphere radii for this link should be positive while its collision spheres are enabled + link_radius = attached_spheres[..., 3].clone() + + assert torch.count_nonzero(link_radius <= 0.0) == 0 + + robot_model.kinematics_config.disable_link_spheres("panda_link5") + + state = robot_model.get_state(q_test.clone()) + + attached_spheres = state.link_spheres_tensor[:, sph_idx, :] + + sph_radius = attached_spheres[..., 3].clone() + assert torch.count_nonzero(sph_radius < 0.0) + + robot_model.kinematics_config.enable_link_spheres("panda_link5") + + state = robot_model.get_state(q_test.clone()) + + radius = link_radius - state.link_spheres_tensor[:, sph_idx, 3] + + assert torch.count_nonzero(radius == 0.0) diff --git a/tests/motion_gen_api_test.py b/tests/motion_gen_api_test.py index ad10fe3..4c852b3 100644 --- a/tests/motion_gen_api_test.py +++ b/tests/motion_gen_api_test.py @@ -14,11 +14,10 @@ import pytest import torch # CuRobo -from curobo.geom.types import WorldConfig from curobo.types.base import TensorDeviceType from curobo.types.math import Pose -from curobo.types.robot import JointState, RobotConfig -from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.types.robot import JointState +from curobo.util_file import get_robot_configs_path, join_path, load_yaml from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig @@ -60,3 +59,49 @@ def test_motion_gen_attach_obstacle_offset(motion_gen): start_state, [obstacle], world_objects_pose_offset=offset_pose ) assert True + + +def test_motion_gen_lock_js_update(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka_mobile.yml" + robot_config = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_config["kinematics"]["lock_joints"] = {"base_x": 0.0, "base_y": 0.0, "base_z": 0.0} + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_config, + world_file, + tensor_args, + use_cuda_graph=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + motion_gen_instance.warmup() + retract_cfg = motion_gen_instance.get_retract_config() + start_state = JointState.from_position(retract_cfg.view(1, -1)) + + kin_state = motion_gen_instance.compute_kinematics(start_state) + ee_pose = kin_state.ee_pose.clone() + + # test motion gen: + plan_start = start_state.clone() + plan_start.position[..., :-2] += 0.1 + result = motion_gen_instance.plan_single(plan_start, ee_pose) + + assert result.success.item() + lock_js = {"base_x": 1.0, "base_y": 0.0, "base_z": 0.0} + motion_gen_instance.update_locked_joints(lock_js, robot_config) + + kin_state_new = motion_gen_instance.compute_kinematics(start_state) + ee_pose_shift = kin_state_new.ee_pose.clone() + + assert torch.norm(ee_pose.position[..., 0] - ee_pose_shift.position[..., 0]).item() == 1.0 + assert torch.norm(ee_pose.position[..., 1:] - ee_pose_shift.position[..., 1:]).item() == 0.0 + + # test motion gen with new lock state: + + result = motion_gen_instance.plan_single(plan_start, ee_pose_shift) + assert result.success.item() + + result = motion_gen_instance.plan_single( + plan_start, ee_pose, MotionGenPlanConfig(max_attempts=3) + ) + assert result.success.item() == False diff --git a/tests/motion_gen_constrained_test.py b/tests/motion_gen_constrained_test.py new file mode 100644 index 0000000..dbdfa5f --- /dev/null +++ b/tests/motion_gen_constrained_test.py @@ -0,0 +1,314 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState +from curobo.wrap.reacher.motion_gen import ( + MotionGen, + MotionGenConfig, + MotionGenPlanConfig, + PoseCostMetric, +) + + +@pytest.fixture(scope="module") +def motion_gen(request): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + use_cuda_graph=True, + project_pose_to_goal_frame=request.param, + ) + motion_gen_instance = MotionGen(motion_gen_config) + motion_gen_instance.warmup(enable_graph=False, warmup_js_trajopt=False) + return motion_gen_instance + + +@pytest.mark.parametrize( + "motion_gen", + [ + (True), + (False), + ], + indirect=True, +) +def test_approach_grasp_pose(motion_gen): + # run full pose planning + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + goal_pose.position[0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + m_config = MotionGenPlanConfig(max_attempts=1) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + # run grasp pose planning: + m_config.pose_cost_metric = PoseCostMetric.create_grasp_approach_metric() + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (True), + (False), + ], + indirect=True, +) +def test_reach_only_position(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + goal_pose.position[0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + m_config = MotionGenPlanConfig( + max_attempts=1, + pose_cost_metric=PoseCostMetric( + reach_partial_pose=True, + reach_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 1, 1, 1]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + reached_state = result.optimized_plan[-1] + reached_pose = motion_gen.compute_kinematics(reached_state).ee_pose.clone() + assert goal_pose.angular_distance(reached_pose) > 0.0 + assert goal_pose.linear_distance(reached_pose) <= 0.005 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (True), + (False), + ], + indirect=True, +) +def test_reach_only_orientation(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + goal_pose.position[0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + m_config = MotionGenPlanConfig( + max_attempts=1, + 
pose_cost_metric=PoseCostMetric( + reach_partial_pose=True, + reach_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 0, 0, 0]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + reached_state = result.optimized_plan[-1] + reached_pose = motion_gen.compute_kinematics(reached_state).ee_pose.clone() + assert goal_pose.linear_distance(reached_pose) > 0.0 + assert goal_pose.angular_distance(reached_pose) < 0.05 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (True), + (False), + ], + indirect=True, +) +def test_hold_orientation(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + goal_pose.position[0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + start_pose = motion_gen.compute_kinematics(start_state).ee_pose.clone() + goal_pose.quaternion = start_pose.quaternion.clone() + m_config = MotionGenPlanConfig( + max_attempts=1, + pose_cost_metric=PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 0, 0, 0]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + traj_pose = motion_gen.compute_kinematics(result.optimized_plan).ee_pose.clone() + + # assert goal_pose.linear_distance(traj_pose) > 0.0 + goal_pose = goal_pose.repeat(traj_pose.shape[0]) + assert torch.max(goal_pose.angular_distance(traj_pose)) < 0.05 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (True), + (False), + ], + indirect=True, +) +def test_hold_position(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + goal_pose.position[0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + start_pose = motion_gen.compute_kinematics(start_state).ee_pose.clone() + goal_pose.position = start_pose.position.clone() + m_config = MotionGenPlanConfig( + max_attempts=1, + pose_cost_metric=PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 1, 1, 1]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + traj_pose = motion_gen.compute_kinematics(result.optimized_plan).ee_pose.clone() + + goal_pose = goal_pose.repeat(traj_pose.shape[0]) + assert torch.max(goal_pose.linear_distance(traj_pose)) < 0.005 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (False), + (True), + ], + indirect=True, +) +def test_hold_partial_pose(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + start_pose = motion_gen.compute_kinematics(start_state).ee_pose.clone() + goal_pose.position = start_pose.position.clone() + goal_pose.quaternion = start_pose.quaternion.clone() + + if motion_gen.project_pose_to_goal_frame: + offset_pose = Pose.from_list([0, 0.1, 0, 1, 0, 0, 0]) + goal_pose = goal_pose.multiply(offset_pose) + else: + goal_pose.position[0, 1] += 0.2 + + m_config = MotionGenPlanConfig( + max_attempts=1, + 
pose_cost_metric=PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 1, 0, 1]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + assert torch.count_nonzero(result.success) == 1 + + traj_pose = motion_gen.compute_kinematics(result.optimized_plan).ee_pose.clone() + + goal_pose = goal_pose.repeat(traj_pose.shape[0]) + if motion_gen.project_pose_to_goal_frame: + traj_pose = goal_pose.compute_local_pose(traj_pose) + traj_pose.position[:, 1] = 0.0 + assert torch.max(traj_pose.position) < 0.005 + + else: + goal_pose.position[:, 1] = 0.0 + traj_pose.position[:, 1] = 0.0 + assert torch.max(goal_pose.linear_distance(traj_pose)) < 0.005 + + +@pytest.mark.parametrize( + "motion_gen", + [ + (False), + (True), + ], + indirect=True, +) +def test_hold_partial_pose_fail(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = state.ee_pose.clone() + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + start_pose = motion_gen.compute_kinematics(start_state).ee_pose.clone() + goal_pose.position = start_pose.position.clone() + goal_pose.quaternion = start_pose.quaternion.clone() + + if motion_gen.project_pose_to_goal_frame: + offset_pose = Pose.from_list([0, 0.1, 0.1, 1, 0, 0, 0]) + goal_pose = goal_pose.multiply(offset_pose) + else: + goal_pose.position[0, 1] += 0.2 + goal_pose.position[0, 0] += 0.2 + + m_config = MotionGenPlanConfig( + max_attempts=1, + pose_cost_metric=PoseCostMetric( + hold_partial_pose=True, + hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 1, 0, 1]), + ), + ) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + assert torch.count_nonzero(result.success) == 0 + assert result.valid_query == False diff --git a/tests/motion_gen_goalset_test.py b/tests/motion_gen_goalset_test.py new file mode 100644 index 0000000..74f2ef2 --- /dev/null +++ b/tests/motion_gen_goalset_test.py @@ -0,0 +1,201 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + # Third Party +import pytest +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +@pytest.fixture(scope="function") +def motion_gen(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + use_cuda_graph=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + return motion_gen_instance + + +@pytest.fixture(scope="function") +def motion_gen_batch(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + use_cuda_graph=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + return motion_gen_instance + + +def test_goalset_padded(motion_gen): + # run goalset planning + motion_gen.warmup(n_goalset=10) + motion_gen.reset() + m_config = MotionGenPlanConfig(False, True) + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(10, 1).view(1, -1, 3), + quaternion=state.ee_quat_seq.repeat(10, 1).view(1, -1, 4), + ) + goal_pose.position[0, 0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + result = motion_gen.plan_goalset(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + # run goalset with fewer goals: + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(2, 1).view(1, -1, 3), + quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4), + ) + goal_pose.position[0, 0, 0] -= 0.1 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + result = motion_gen.plan_goalset(start_state, goal_pose, m_config.clone()) + + # run single planning + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + result = motion_gen.plan_single(start_state, goal_pose, m_config.clone()) + + assert torch.count_nonzero(result.success) == 1 + + +def test_batch_goalset_padded(motion_gen_batch): + motion_gen = motion_gen_batch + motion_gen.warmup(n_goalset=10, batch=3, enable_graph=False) + # run goalset planning + motion_gen.reset() + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(3 * 3, 1).view(3, -1, 3).contiguous(), + quaternion=state.ee_quat_seq.repeat(3 * 3, 1).view(3, -1, 4).contiguous(), + ) + goal_pose.position[0, 1, 1] = 0.2 + goal_pose.position[1, 0, 1] = 0.2 + goal_pose.position[2, 1, 1] = 0.2 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.2).repeat_seeds(3) + + m_config = MotionGenPlanConfig(False, True, max_attempts=10, enable_graph_attempt=20) + result = motion_gen.plan_batch_goalset(start_state, goal_pose, m_config.clone()) + + # get
final solutions: + assert torch.count_nonzero(result.success) == result.success.shape[0] + + reached_state = motion_gen.compute_kinematics(result.optimized_plan.trim_trajectory(-1)) + + # + goal_position = torch.cat( + [ + goal_pose.position[x, result.goalset_index[x], :].unsqueeze(0) + for x in range(len(result.goalset_index)) + ] + ) + + assert result.goalset_index is not None + + assert torch.max(torch.norm(goal_position - reached_state.ee_pos_seq, dim=-1)) < 0.005 + + # run goalset with less goals: + + motion_gen.reset() + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(6, 1).view(3, -1, 3).contiguous(), + quaternion=state.ee_quat_seq.repeat(6, 1).view(3, -1, 4).contiguous(), + ) + goal_pose.position[0, 1, 1] = 0.2 + goal_pose.position[1, 0, 1] = 0.2 + goal_pose.position[2, 1, 1] = 0.2 + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.2).repeat_seeds(3) + + result = motion_gen.plan_batch_goalset(start_state, goal_pose, m_config) + + # get final solutions: + assert torch.count_nonzero(result.success) == result.success.shape[0] + + reached_state = motion_gen.compute_kinematics(result.optimized_plan.trim_trajectory(-1)) + + # + goal_position = torch.cat( + [ + goal_pose.position[x, result.goalset_index[x], :].unsqueeze(0) + for x in range(len(result.goalset_index)) + ] + ) + + assert result.goalset_index is not None + + assert torch.max(torch.norm(goal_position - reached_state.ee_pos_seq, dim=-1)) < 0.005 + # run single planning + + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze() + ).repeat_seeds(3) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(3) + + goal_pose.position[1, 0] -= 0.1 + + result = motion_gen.plan_batch(start_state, goal_pose.clone(), m_config) + assert torch.count_nonzero(result.success) == 3 + + # get final solutions: + q = result.optimized_plan.trim_trajectory(-1).squeeze(1) + reached_state = motion_gen.compute_kinematics(q) + assert torch.norm(goal_pose.position - reached_state.ee_pos_seq) < 0.005 diff --git a/tests/motion_gen_module_test.py b/tests/motion_gen_module_test.py index 805b509..de8e467 100644 --- a/tests/motion_gen_module_test.py +++ b/tests/motion_gen_module_test.py @@ -23,7 +23,7 @@ from curobo.util_file import get_robot_configs_path, get_world_configs_path, joi from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig -@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def motion_gen(): tensor_args = TensorDeviceType() world_file = "collision_table.yml" @@ -38,7 +38,7 @@ def motion_gen(): return motion_gen_instance -@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def motion_gen_batch_env(): tensor_args = TensorDeviceType() world_files = ["collision_table.yml", "collision_test.yml"] @@ -101,6 +101,7 @@ def test_motion_gen_goalset(motion_gen): state.ee_pos_seq.repeat(2, 1).view(1, -1, 3), quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4), ) + goal_pose.position[0, 0, 0] -= 0.1 start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) @@ -121,6 +122,13 @@ def test_motion_gen_goalset(motion_gen): < 0.005 ) + assert result.goalset_index is not None + + assert ( + torch.norm(goal_pose.position[:, 
result.goalset_index, :] - reached_state.ee_pos_seq) + < 0.005 + ) + def test_motion_gen_batch_goalset(motion_gen): motion_gen.reset() @@ -130,29 +138,36 @@ def test_motion_gen_batch_goalset(motion_gen): state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) goal_pose = Pose( - state.ee_pos_seq.repeat(4, 1).view(2, -1, 3), - quaternion=state.ee_quat_seq.repeat(4, 1).view(2, -1, 4), + state.ee_pos_seq.repeat(6, 1).view(3, -1, 3).clone(), + quaternion=state.ee_quat_seq.repeat(6, 1).view(3, -1, 4).clone(), ) + goal_pose.position[0, 1, 1] = 0.2 + goal_pose.position[1, 0, 1] = 0.2 + goal_pose.position[2, 1, 1] = 0.2 - start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(2) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.2).repeat_seeds(3) - m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10, max_attempts=1) result = motion_gen.plan_batch_goalset(start_state, goal_pose, m_config) # get final solutions: - assert torch.count_nonzero(result.success) > 0 + assert torch.count_nonzero(result.success) == result.success.shape[0] reached_state = motion_gen.compute_kinematics(result.optimized_plan.trim_trajectory(-1)) - assert ( - torch.min( - torch.norm(goal_pose.position[:, 0, :] - reached_state.ee_pos_seq), - torch.norm(goal_pose.position[:, 1, :] - reached_state.ee_pos_seq), - ) - < 0.005 + # + goal_position = torch.cat( + [ + goal_pose.position[x, result.goalset_index[x], :].unsqueeze(0) + for x in range(len(result.goalset_index)) + ] ) + assert result.goalset_index is not None + + assert torch.max(torch.norm(goal_position - reached_state.ee_pos_seq, dim=-1)) < 0.005 + def test_motion_gen_batch(motion_gen): motion_gen.reset() @@ -188,7 +203,6 @@ def test_motion_gen_batch(motion_gen): ], ) def test_motion_gen_batch_graph(motion_gen_str: str, interpolation: InterpolateType, request): - # return motion_gen = request.getfixturevalue(motion_gen_str) motion_gen.graph_planner.interpolation_type = interpolation @@ -275,6 +289,17 @@ def test_motion_gen_batch_env_goalset(motion_gen_batch_env): < 0.005 ) + goal_position = torch.cat( + [ + goal_pose.position[x, result.goalset_index[x], :].unsqueeze(0) + for x in range(len(result.goalset_index)) + ] + ) + + assert result.goalset_index is not None + + assert torch.max(torch.norm(goal_position - reached_state.ee_pos_seq, dim=-1)) < 0.005 + @pytest.mark.parametrize( "motion_gen_str,enable_graph", diff --git a/tests/motion_gen_speed_test.py b/tests/motion_gen_speed_test.py new file mode 100644 index 0000000..9a208ef --- /dev/null +++ b/tests/motion_gen_speed_test.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.trajectory import InterpolateType +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +@pytest.fixture(scope="function") +def motion_gen(request): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + velocity_scale=request.param, + interpolation_steps=10000, + interpolation_dt=0.02, + ) + motion_gen_instance = MotionGen(motion_gen_config) + return motion_gen_instance + + +@pytest.mark.parametrize( + "motion_gen", + [ + (1.0), + (0.75), + (0.5), + (0.25), + (0.15), + (0.1), + ], + indirect=True, +) +def test_motion_gen_velocity_scale(motion_gen): + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + m_config = MotionGenPlanConfig(False, True, max_attempts=10) + + result = motion_gen.plan_single(start_state, goal_pose, m_config) + + assert torch.count_nonzero(result.success) == 1 diff --git a/tests/mpc_test.py b/tests/mpc_test.py index 7188b00..0867dcf 100644 --- a/tests/mpc_test.py +++ b/tests/mpc_test.py @@ -130,7 +130,7 @@ def test_mpc_single(mpc_str, expected, request): if result.metrics.pose_error.item() < 0.05: converged = True break - if tstep > 100: + if tstep > 200: break assert converged == expected diff --git a/tests/robot_world_model_test.py b/tests/robot_world_model_test.py index 6e4be11..30cd586 100644 --- a/tests/robot_world_model_test.py +++ b/tests/robot_world_model_test.py @@ -85,7 +85,7 @@ def test_robot_world_collision_vector(): d_world, d_vec = model.get_collision_vector(state.link_spheres_tensor.view(n, 1, -1, 4)) assert d_world.shape[0] == n - assert torch.norm(d_world[0] - 0.1385) < 0.002 + assert torch.norm(d_world[0] - 0.1385) < 0.005 assert torch.abs(d_vec[0, 0, 0, 2] + 0.7350) > 0.002
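Usage note (not part of the changeset): the constrained planning mode introduced above is driven entirely through `MotionGenPlanConfig`. A minimal sketch distilled from `tests/motion_gen_constrained_test.py`, assuming the packaged `franka.yml` robot and `collision_table.yml` world files; variable names here are illustrative only, and the weight layout `[rx, ry, rz, x, y, z]` follows those tests:

```python
# Position-only reaching with the new PoseCostMetric (constrained planning).
# Condensed from tests/motion_gen_constrained_test.py; a sketch, not an
# authoritative example.
from curobo.types.base import TensorDeviceType
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import (
    MotionGen,
    MotionGenConfig,
    MotionGenPlanConfig,
    PoseCostMetric,
)

tensor_args = TensorDeviceType()
motion_gen = MotionGen(
    MotionGenConfig.load_from_robot_config(
        "franka.yml", "collision_table.yml", tensor_args, use_cuda_graph=True
    )
)
motion_gen.warmup(enable_graph=False, warmup_js_trajopt=False)

# Build a reachable goal from the retract configuration, as the tests do.
retract_cfg = motion_gen.get_retract_config()
state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1)))
goal_pose = state.ee_pose.clone()
goal_pose.position[0, 0] -= 0.1
start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3)

# Zero rotation weights -> only the goal position is enforced at convergence.
plan_config = MotionGenPlanConfig(
    max_attempts=1,
    pose_cost_metric=PoseCostMetric(
        reach_partial_pose=True,
        reach_vec_weight=tensor_args.to_device([0, 0, 0, 1, 1, 1]),
    ),
)
result = motion_gen.plan_single(start_state, goal_pose, plan_config)
assert result.success.item()
```

`PoseCostMetric.create_grasp_approach_metric()` builds the approach-then-grasp variant exercised by `test_approach_grasp_pose`, and hold constraints are configured the same way through `hold_partial_pose` and `hold_vec_weight`.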
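Likewise, the goalset warmup padding added in `src/curobo/wrap/reacher/types.py` lets a solver warmed up with `n_goalset=10` accept fewer goals, and converts `plan_single` calls into padded goalset queries instead of rebuilding the CUDA graph. A condensed sketch under the same config assumptions, following `test_goalset_padded`:

```python
# Goalset warmup padding, condensed from tests/motion_gen_goalset_test.py.
from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig

tensor_args = TensorDeviceType()
motion_gen = MotionGen(
    MotionGenConfig.load_from_robot_config(
        "franka.yml", "collision_table.yml", tensor_args, use_cuda_graph=True
    )
)
motion_gen.warmup(n_goalset=10)  # buffers and CUDA graph sized for 10 goals

retract_cfg = motion_gen.get_retract_config()
state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1)))
start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3)
m_config = MotionGenPlanConfig(False, True)

# Plan with only 2 goals; the remaining slots are padded internally.
goal_pose = Pose(
    state.ee_pos_seq.repeat(2, 1).view(1, -1, 3),
    quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4),
)
goal_pose.position[0, 0, 0] -= 0.1
result = motion_gen.plan_goalset(start_state, goal_pose, m_config.clone())
print(result.goalset_index)  # index of the goal the returned plan reached

# plan_single after a goalset warmup: converted to a padded goalset query
# internally, so the existing CUDA graph is reused.
single_goal = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq)
result = motion_gen.plan_single(start_state, single_goal, m_config.clone())
```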