Bug Fixes, Add Isaac Sim 4.5 support for examples

Balakumar Sundaralingam
2025-04-25 11:24:16 -07:00
parent 2fbffc3522
commit 0a50de1ba7
43 changed files with 728 additions and 193 deletions

View File

@@ -10,6 +10,20 @@ its affiliates is strictly prohibited.
-->
# Changelog
## Version 0.7.7
### New Features
- Add CPU support for ``types.math.Pose``.
- Add a ``cdist`` fallback for robot segmentation when Triton is not available; it requires half the memory.
### BugFixes & Misc.
- Fix bug in LBFGS where buffers were not reinitialized upon change in history.
- Add Isaac Sim 4.5 support for examples (the import-fallback pattern is sketched after this
  list). All examples work except ``load_all_robots.py``, which does not yet run correctly.
- Fix bug in jerk gradient calculation, improving convergence in trajectory optimization.
  Multi-arm reacher is slightly improved (see the Isaac Sim example).
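
The Isaac Sim 4.5 support in the examples relies on a small import-compatibility shim rather than a hard version switch. A minimal sketch of the pattern, taken from the example diffs in this commit:

```python
# Prefer the pre-4.5 module path; fall back to the renamed Isaac Sim 4.5+ packages.
try:
    from omni.isaac.core.materials import OmniPBR  # Isaac Sim 2023.x and earlier
except ImportError:
    from isaacsim.core.api.materials import OmniPBR  # Isaac Sim 4.5+

try:
    from omni.isaac.debug_draw import _debug_draw  # Isaac Sim 2023.x and earlier
except ImportError:
    from isaacsim.util.debug_draw import _debug_draw  # Isaac Sim 4.5+
```

The same try/except shape wraps the URDF importer (``omni.importer.urdf`` vs ``isaacsim.asset.importer.urdf``) in ``helper.py``, which also sets the ``ISAAC_SIM_45`` flag used by ``add_robot_to_scene``.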
## Version 0.7.6
### Changes in Default Behavior

View File

@@ -41,6 +41,7 @@ if [ $input_arg == "x86" ]; then
--env DISPLAY=unix$DISPLAY \
--volume /tmp/.X11-unix:/tmp/.X11-unix \
--volume /dev:/dev \
--ipc=host \
curobo_docker:user_$input_arg
elif [ $input_arg == "aarch64" ]; then

View File

@@ -8,9 +8,7 @@
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
""" Example computing collisions using curobo
"""
"""Example computing collisions using curobo"""
# Third Party
import torch

View File

@@ -51,7 +51,13 @@ import carb
import numpy as np
from helper import add_extensions
from omni.isaac.core import World
try:
from omni.isaac.core.materials import OmniPBR
except ImportError:
from isaacsim.core.api.materials import OmniPBR
from omni.isaac.core.objects import sphere
# CuRobo

View File

@@ -126,9 +126,11 @@ def main():
"/World/world_" + str(i) + "/",
robot_name="robot_" + str(i),
position=pose.position[0].cpu().numpy(),
initialize_world=False,
)
robot_list.append(r[0])
setup_curobo_logger("warn")
my_world.initialize_physics()
# warmup curobo instance
@@ -195,9 +197,10 @@ def main():
continue
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
for robot in robot_list:
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -223,6 +226,8 @@ def main():
past_goal = ik_goal.clone()
sim_js_names = robot_list[0].dof_names
sim_js = robot_list[0].get_joints_state()
if sim_js is None:
continue
full_js = JointState(
position=tensor_args.to_device(sim_js.positions).view(1, -1),
velocity=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0,

View File

@@ -54,7 +54,11 @@ import carb
import numpy as np
from helper import add_extensions
from omni.isaac.core import World
try:
from omni.isaac.core.materials import OmniPBR
except ImportError:
from isaacsim.core.api.materials import OmniPBR
from omni.isaac.core.objects import sphere
# CuRobo
@@ -73,7 +77,10 @@ from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig
def draw_line(start, gradient):
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -194,7 +201,10 @@ def main():
else:
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:

View File

@@ -58,7 +58,11 @@ import argparse
# Third Party
import carb
from omni.isaac.core import World
try:
from omni.isaac.core.materials import OmniGlass, OmniPBR
except ImportError:
from isaacsim.core.api.materials import OmniGlass, OmniPBR
from omni.isaac.core.objects import cuboid, sphere
from omni.isaac.core.utils.types import ArticulationAction
@@ -207,8 +211,9 @@ if __name__ == "__main__":
continue
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -218,7 +223,7 @@ if __name__ == "__main__":
if False and step_index % 50 == 0.0: # No obstacle update
obstacles = usd_help.get_obstacles_from_stage(
# only_paths=[obstacles_path],
only_paths=["/World"],
reference_prim_path=robot_prim_path,
ignore_substring=[
robot_prim_path,

View File

@@ -16,7 +16,6 @@ from typing import Dict, List
import numpy as np
from matplotlib import cm
from omni.isaac.core import World
from omni.isaac.core.materials import OmniPBR
from omni.isaac.core.objects import cuboid
from omni.isaac.core.robots import Robot
from pxr import UsdPhysics
@@ -26,14 +25,28 @@ from curobo.util.logger import log_warn
from curobo.util.usd_helper import set_prim_transform
ISAAC_SIM_23 = False
ISAAC_SIM_45 = False
try:
# Third Party
from omni.isaac.urdf import _urdf # isaacsim 2022.2
except ImportError:
# Third Party
try:
from omni.importer.urdf import _urdf # isaac sim 2023.1 or above
except ImportError:
from isaacsim.asset.importer.urdf import _urdf # isaac sim 4.5+
ISAAC_SIM_45 = True
ISAAC_SIM_23 = True
try:
# import for older isaacsim installations
from omni.isaac.core.materials import OmniPBR
except ImportError:
# import for isaac sim 4.5+
from isaacsim.core.api.materials import OmniPBR
# Standard Library
from typing import Optional
@@ -67,6 +80,7 @@ def add_robot_to_scene(
subroot: str = "",
robot_name: str = "robot",
position: np.array = np.array([0, 0, 0]),
initialize_world: bool = True,
):
urdf_interface = _urdf.acquire_urdf_interface()
@@ -91,11 +105,50 @@ def add_robot_to_scene(
and robot_config["kinematics"]["external_asset_path"] is not None
):
asset_path = robot_config["kinematics"]["external_asset_path"]
# urdf_path:
# meshes_path:
# meshes path should be a subset of urdf_path
full_path = join_path(asset_path, robot_config["kinematics"]["urdf_path"])
# full path contains the path to urdf
# Get meshes path
robot_path = get_path_of_dir(full_path)
filename = get_filename(full_path)
if ISAAC_SIM_45:
from isaacsim.core.utils.extensions import get_extension_path_from_name
import omni.kit.commands
import omni.usd
# Retrieve the path of the URDF file from the extension
extension_path = get_extension_path_from_name("isaacsim.asset.importer.urdf")
root_path = robot_path
file_name = filename
# Parse the robot's URDF file to generate a robot model
dest_path = join_path(
root_path, get_filename(file_name, remove_extension=True) + "_temp.usd"
)
result, robot_path = omni.kit.commands.execute(
"URDFParseAndImportFile",
urdf_path="{}/{}".format(root_path, file_name),
import_config=import_config,
dest_path=dest_path,
)
prim_path = omni.usd.get_stage_next_free_path(
my_world.scene.stage,
str(my_world.scene.stage.GetDefaultPrim().GetPath()) + robot_path,
False,
)
robot_prim = my_world.scene.stage.OverridePrim(prim_path)
robot_prim.GetReferences().AddReference(dest_path)
robot_path = prim_path
else:
imported_robot = urdf_interface.parse_urdf(robot_path, filename, import_config)
dest_path = subroot
robot_path = urdf_interface.import_robot(
robot_path,
filename,
@@ -117,6 +170,11 @@ def add_robot_to_scene(
set_prim_transform(linkp, [position[0], position[1], position[2], 1, 0, 0, 0])
robot = my_world.scene.add(robot_p)
if initialize_world:
if ISAAC_SIM_45:
my_world.initialize_physics()
robot.initialize()
return robot, robot_path

View File

@@ -112,7 +112,10 @@ def get_pose_grid(n_x, n_y, n_z, max_x, max_y, max_z):
def draw_points(pose, success):
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
N = 100
@@ -236,9 +239,8 @@ def main():
continue
step_index = my_world.current_time_step_index
# print(step_index)
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -251,7 +253,7 @@ def main():
if step_index == 50 or step_index % 500 == 0.0: # and cmd_plan is None:
print("Updating world, reading w.r.t.", robot_prim_path)
obstacles = usd_help.get_obstacles_from_stage(
# only_paths=[obstacles_path],
only_paths=["/World"],
reference_prim_path=robot_prim_path,
ignore_substring=[
robot_prim_path,

View File

@@ -68,13 +68,17 @@ from curobo.types.math import Pose
from curobo.types.robot import RobotConfig
########### OV #################
from curobo.util.logger import setup_curobo_logger
from curobo.util.logger import setup_curobo_logger, log_warn
from curobo.util.usd_helper import UsdHelper
from curobo.util_file import get_motion_gen_robot_list, get_robot_configs_path, join_path, load_yaml
def main():
log_warn("This example will not work correctly in isaac sim 4.5+")
list_of_robots = get_motion_gen_robot_list() # [:2]
# remove tm12 as meshes don't load correctly in isaac sim 4.5
if "tm12.yml" in list_of_robots:
list_of_robots.remove("tm12.yml")
usd_help = UsdHelper()
# assuming obstacles are in objects_path:
@@ -107,12 +111,14 @@ def main():
pose.position[0, 1] = 0
robot_cfg = load_yaml(join_path(get_robot_configs_path(), list_of_robots[i]))["robot_cfg"]
robot_cfg_list.append(robot_cfg)
print("Loading robot: ", list_of_robots[i])
r = add_robot_to_scene(
robot_cfg,
my_world,
"/World/world_" + str(i) + "/",
robot_name="/World/world_" + str(i) + "/" "robot_" + str(i),
position=pose.position[0].cpu().numpy(),
initialize_world=False,
)
robot_list.append(r[0])
@@ -135,6 +141,9 @@ def main():
setup_curobo_logger("warn")
my_world.initialize_physics()
my_world.reset()
add_extensions(simulation_app, args.headless_mode)
while simulation_app.is_running():
my_world.step(render=True)
@@ -146,9 +155,9 @@ def main():
continue
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
for ri, robot in enumerate(robot_list):
robot._articulation_view.initialize()
j_names = robot_cfg_list[ri]["kinematics"]["cspace"]["joint_names"]
default_config = robot_cfg_list[ri]["kinematics"]["cspace"]["retract_config"]

View File

@@ -272,12 +272,9 @@ def main():
continue
step_index = my_world.current_time_step_index
# print(step_index)
if articulation_controller is None:
# robot.initialize()
articulation_controller = robot.get_articulation_controller()
if step_index < 2:
my_world.reset()
if step_index < 10:
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -291,7 +288,7 @@ def main():
if step_index == 50 or step_index % 1000 == 0.0:
print("Updating world, reading w.r.t.", robot_prim_path)
obstacles = usd_help.get_obstacles_from_stage(
# only_paths=[obstacles_path],
only_paths=["/World"],
reference_prim_path=robot_prim_path,
ignore_substring=[
robot_prim_path,
@@ -319,6 +316,9 @@ def main():
past_orientation = cube_orientation
sim_js = robot.get_joints_state()
if sim_js is None:
print("sim_js is None")
continue
sim_js_names = robot.dof_names
if np.any(np.isnan(sim_js.positions)):
log_error("isaac sim has returned NAN joint position values.")
@@ -362,7 +362,7 @@ def main():
spheres[si].set_radius(float(s.radius))
robot_static = False
if (np.max(np.abs(sim_js.velocities)) < 0.2) or args.reactive:
if (np.max(np.abs(sim_js.velocities)) < 0.5) or args.reactive:
robot_static = True
if (
(

View File

@@ -188,8 +188,9 @@ def main():
step_index = my_world.current_time_step_index
# print(step_index)
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -207,6 +208,9 @@ def main():
if target_pose is None:
target_pose = cube_position
sim_js = robot.get_joints_state()
if sim_js is None:
print("sim_js is None")
continue
sim_js_names = robot.dof_names
cu_js = JointState(
position=tensor_args.to_device(sim_js.positions),

View File

@@ -116,8 +116,10 @@ def draw_points(rollouts: torch.Tensor):
import random
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
N = 100
# if draw.get_num_points() > 0:
@@ -265,8 +267,10 @@ def main():
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -281,7 +285,7 @@ def main():
if step_index % 1000 == 0:
print("Updating world")
obstacles = usd_help.get_obstacles_from_stage(
# only_paths=[obstacles_path],
only_paths=["/World"],
ignore_substring=[
robot_prim_path,
"/World/target",
@@ -315,6 +319,9 @@ def main():
# get robot current state:
sim_js = robot.get_joints_state()
if sim_js is None:
print("sim_js is None")
continue
js_names = robot.dof_names
sim_js_names = robot.dof_names
@@ -354,7 +361,7 @@ def main():
cmd_state_full = cmd_state
art_action = ArticulationAction(
cmd_state.position.cpu().numpy(),
cmd_state.position.view(-1).cpu().numpy(),
# cmd_state.velocity.cpu().numpy(),
joint_indices=idx_list,
)
@@ -364,7 +371,7 @@ def main():
if succ:
# set desired joint angles obtained from IK:
for _ in range(3):
for _ in range(1):
articulation_controller.apply_action(art_action)
else:

View File

@@ -104,7 +104,10 @@ def draw_points(rollouts: torch.Tensor):
import random
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
N = 100
@@ -242,8 +245,9 @@ def main():
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -278,6 +282,9 @@ def main():
# get robot current state:
sim_js = robot.get_joints_state()
if sim_js is None:
print("sim_js is None")
continue
js_names = robot.dof_names
sim_js_names = robot.dof_names

View File

@@ -149,6 +149,7 @@ def main():
collision_activation_distance=0.025,
fixed_iters_trajopt=True,
maximum_trajectory_dt=0.5,
ik_opt_iters=500,
)
motion_gen = MotionGen(motion_gen_config)
print("warming up...")
@@ -221,8 +222,9 @@ def main():
step_index = my_world.current_time_step_index
# print(step_index)
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)
@@ -235,7 +237,7 @@ def main():
if step_index == 50 or step_index % 1000 == 0.0:
print("Updating world, reading w.r.t.", robot_prim_path)
obstacles = usd_help.get_obstacles_from_stage(
# only_paths=[obstacles_path],
only_paths=["/World"],
reference_prim_path=robot_prim_path,
ignore_substring=[
robot_prim_path,
@@ -258,6 +260,9 @@ def main():
if target_pose is None:
target_pose = cube_position
sim_js = robot.get_joints_state()
if sim_js is None:
print("sim_js is None")
continue
sim_js_names = robot.dof_names
cu_js = JointState(
position=tensor_args.to_device(sim_js.positions),
@@ -287,11 +292,10 @@ def main():
for si, s in enumerate(sph_list[0]):
spheres[si].set_world_pose(position=np.ravel(s.position))
spheres[si].set_radius(float(s.radius))
# print(sim_js.velocities)
if (
np.linalg.norm(cube_position - target_pose) > 1e-3
and np.linalg.norm(past_pose - cube_position) == 0.0
and np.max(np.abs(sim_js.velocities)) < 0.2
and np.max(np.abs(sim_js.velocities)) < 0.5
):
# Set EE teleop goals, use cube for simple non-vr init:
ee_translation_goal = cube_position

View File

@@ -68,7 +68,10 @@ def draw_points(voxels):
# Third Party
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -125,7 +128,10 @@ def clip_camera(camera_data):
def draw_line(start, gradient):
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -270,7 +276,10 @@ if __name__ == "__main__":
draw_line(sph_position, d_vec[..., :3].view(3).cpu().numpy())
else:
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:

View File

@@ -98,7 +98,10 @@ def draw_rollout_points(rollouts: torch.Tensor, clear: bool = False):
import random
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
N = 100
@@ -124,7 +127,10 @@ def draw_points(voxels):
# Third Party
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -171,7 +177,10 @@ def clip_camera(camera_data):
def draw_line(start, gradient):
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -345,8 +354,9 @@ if __name__ == "__main__":
if cmd_step_idx == 0:
draw_rollout_points(mpc.get_visual_rollouts(), clear=not args.use_debug_draw)
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)

View File

@@ -88,7 +88,10 @@ def draw_points(voxels):
# Third Party
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -135,7 +138,10 @@ def clip_camera(camera_data):
def draw_line(start, gradient):
# Third Party
try:
from omni.isaac.debug_draw import _debug_draw
except ImportError:
from isaacsim.util.debug_draw import _debug_draw
draw = _debug_draw.acquire_debug_draw_interface()
# if draw.get_num_points() > 0:
@@ -283,8 +289,9 @@ if __name__ == "__main__":
continue
step_index = my_world.current_time_step_index
if step_index <= 2:
my_world.reset()
if step_index <= 10:
# my_world.reset()
robot._articulation_view.initialize()
idx_list = [robot.get_dof_index(x) for x in j_names]
robot.set_joint_positions(default_config, idx_list)

View File

@@ -8,9 +8,7 @@
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
""" Example computing fk using curobo
"""
"""Example computing fk using curobo"""
# Third Party
import torch

examples/mesh_dataset.py (new file, 224 lines added)
View File

@@ -0,0 +1,224 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
import math
import os
import sys
from typing import Optional
import numpy as np
import pyrender
import torch
import torch.nn.functional as F
import trimesh
from torch.utils.data.dataset import Dataset
def fov_and_size_to_intrinsics(fov, img_size, device="cpu"):
img_h, img_w = img_size
fx = img_w / (2 * math.tan(math.radians(fov) / 2))
fy = img_h / (2 * math.tan(math.radians(fov) / 2))
intrinsics = torch.tensor(
[[fx, 0, img_h / 2], [0, fy, img_w / 2], [0, 0, 1]],
dtype=torch.float,
device=device,
)
return intrinsics
def lookat_to_cam_pose(eyes, ats, ups=[[0, 0, 1]], device="cpu", mode="opengl"):
if not isinstance(eyes, torch.Tensor):
eyes = torch.tensor(eyes, device=device, dtype=torch.float32)
if not isinstance(ats, torch.Tensor):
ats = torch.tensor(ats, device=device, dtype=torch.float32)
if not isinstance(ups, torch.Tensor):
ups = torch.tensor(ups, device=device, dtype=torch.float32)
batch_size = eyes.shape[0]
camera_view = F.normalize(ats - eyes, dim=1)
camera_right = F.normalize(torch.cross(camera_view, ups, dim=1), dim=1)
camera_up = F.normalize(torch.cross(camera_right, camera_view, dim=1), dim=1)
# rotation matrix from opencv conventions
T = torch.zeros((batch_size, 4, 4))
if mode == "opengl":
T[:, :3, :] = torch.stack([camera_right, camera_up, -camera_view, eyes], dim=2)
elif mode == "opencv":
T[:, :3, :] = torch.stack([camera_right, -camera_up, camera_view, eyes], dim=2)
else:
raise ValueError(f"Unknown mode: {mode}")
T[:, 3, 3] = 1.0
return T.float()
def sample_sphere_points(N, radius, device="cuda"):
latitude = (torch.rand(size=(N, 1), device=device) - 0.5) * torch.pi
longitude = (torch.rand(size=(N, 1), device=device) - 0.5) * torch.pi * 2
x = torch.cos(latitude) * torch.cos(longitude)
y = torch.sin(latitude) * torch.cos(longitude)
z = torch.sin(longitude)
pc = torch.cat([x, y, z], dim=1) * radius
return pc
def sample_sphere_poses(N, origin, radius, device="cuda"):
eyes = sample_sphere_points(N, radius, device)
if not isinstance(origin, torch.Tensor):
origin = torch.tensor(origin).float().to(device)
ats = origin[None, :].repeat((N, 1))
poses_gl = lookat_to_cam_pose(eyes, ats, device=device, mode="opengl")
poses_cv = lookat_to_cam_pose(eyes, ats, device=device, mode="opencv")
return poses_gl, poses_cv
def compute_origin_and_radius(trimesh_scene):
low, high = trimesh_scene.bounds
center = (high + low) / 2
low = low - center
high = high - center
radius = max(np.sqrt((high**2).sum()), np.sqrt((low**2).sum()))
return center, radius
def render_batch(trimesh_mesh, camera_poses, fov, image_size):
camera_poses = camera_poses.detach().cpu().numpy()
mesh = pyrender.Mesh.from_trimesh(trimesh_mesh)
scene = pyrender.Scene()
scene.add(mesh)
camera = pyrender.PerspectiveCamera(yfov=fov, aspectRatio=1.0)
camera = pyrender.Node(camera=camera, matrix=np.eye(4))
scene.add_node(camera)
light = pyrender.SpotLight(
color=np.ones(3),
intensity=1.0,
innerConeAngle=np.pi / 16.0,
outerConeAngle=np.pi / 2.0,
)
light = pyrender.Node(light=light, matrix=np.eye(4))
scene.add_node(light)
r = pyrender.OffscreenRenderer(image_size, image_size)
colors = []
depths = []
for camera_pose in camera_poses:
scene.set_pose(camera, camera_pose)
scene.set_pose(light, camera_pose)
color, depth = r.render(scene)
colors.append(color)
depths.append(depth)
return np.asarray(colors), np.asarray(depths)
"""
MeshDataset takes a path to a mesh (or a trimesh.Trimesh) and uses pyrender to render RGB-D
images of the mesh from camera poses sampled on a sphere centered on the scene.
"""
class MeshDataset(Dataset):
def __init__(
self,
mesh_file: str = None,
n_frames: int = 10,
image_size: float = 256,
save_data_dir: Optional[str] = None,
trimesh_mesh: Optional[trimesh.Trimesh] = None,
fov_deg: int = 60,
# visible_point: Optional[List[float]] = None,
) -> None:
super().__init__()
self.mesh_file = mesh_file
self.n_frames = n_frames
if trimesh_mesh is None:
self.trimesh_mesh = trimesh.load(self.mesh_file)
else:
self.trimesh_mesh = trimesh_mesh
self.image_size = image_size
origin, radius = compute_origin_and_radius(self.trimesh_mesh)
self.fov_deg = fov_deg
sphere_radius = radius * 2.0
self.camera_poses_gl, self.camera_poses_cv = sample_sphere_poses(
n_frames, origin, sphere_radius, "cuda"
)
self.colors, self.depths = render_batch(
self.trimesh_mesh,
self.camera_poses_gl,
fov=math.radians(self.fov_deg),
image_size=self.image_size,
)
self.intrinsics = fov_and_size_to_intrinsics(
self.fov_deg, (self.image_size, self.image_size), device="cuda"
)
if save_data_dir is not None:
self.save_as_sun3d_dataset(save_data_dir)
# sys.exit(0)
def save_as_sun3d_dataset(self, output_dir):
import imageio
from transforms3d.quaternions import quat2mat
os.makedirs(output_dir, exist_ok=True)
K = self.intrinsics.detach().cpu().numpy().tolist()
intrinsics_text = f"""{K[0][0]} {K[0][1]} {K[0][2]}
{K[1][0]} {K[1][1]} {K[1][2]}
{K[2][0]} {K[2][1]} {K[2][2]}"""
with open(f"{output_dir}/camera-intrinsics.txt", "w") as fp:
fp.write(intrinsics_text)
seqdir = f"{output_dir}/seq-01"
os.makedirs(seqdir, exist_ok=True)
for i in range(len(self)):
data = self[i]
rgb = data["rgba"][:3, :, :].detach().cpu().permute(1, 2, 0).numpy()
depth = data["depth"]
depth = (depth * 1000).detach().cpu().numpy().astype(np.uint16)
nvblox_pose = data["pose"]
eigen_quat = [0.707106769, 0.707106769, 0, 0]
sun3d_to_nvblox_T = torch.eye(4)
sun3d_to_nvblox_T[:3, :3] = torch.tensor(quat2mat(eigen_quat))
sun3d_pose = torch.linalg.inv(sun3d_to_nvblox_T) @ nvblox_pose
P = sun3d_pose.detach().cpu().numpy().tolist()
pose_text = f"""{P[0][0]} {P[0][1]} {P[0][2]} {P[0][3]}
{P[1][0]} {P[1][1]} {P[1][2]} {P[1][3]}
{P[2][0]} {P[2][1]} {P[2][2]} {P[2][3]}
{P[3][0]} {P[3][1]} {P[3][2]} {P[3][3]}"""
framename = f"frame-{str(i).zfill(6)}"
imageio.imwrite(f"{seqdir}/{framename}.color.png", rgb)
imageio.imwrite(f"{seqdir}/{framename}.depth.png", depth)
with open(f"{seqdir}/{framename}.pose.txt", "w") as fp:
fp.write(pose_text)
def __len__(self):
return self.n_frames
def __getitem__(self, index):
rgb_np = self.colors[index]
depth_np = self.depths[index]
a_np = (depth_np > 0).astype(np.uint8) * 255
rgba_np = np.concatenate([rgb_np, a_np[:, :, None]], axis=2)
pose = self.camera_poses_cv[index]
intrinsics = self.intrinsics
depth_np = depth_np.astype(np.float32)
rgba = torch.from_numpy(rgba_np).permute((2, 0, 1))
depth = torch.from_numpy(depth_np).float()
return {"rgba": rgba, "depth": depth, "pose": pose, "intrinsics": intrinsics}

View File

@@ -19,7 +19,7 @@ import imageio
import numpy as np
import torch
import torch.autograd.profiler as profiler
from nvblox_torch.datasets.mesh_dataset import MeshDataset
from mesh_dataset import MeshDataset
from torch.profiler import ProfilerActivity, profile, record_function
# CuRobo
@@ -79,7 +79,7 @@ def create_render_dataset(
mesh_dataset = MeshDataset(
None,
n_frames=n_frames,
image_size=640,
image_size=1920,
save_data_dir=None,
trimesh_mesh=robot_mesh,
fov_deg=fov_deg,
@@ -96,10 +96,14 @@ def mask_image(robot_file="ur5e.yml"):
tensor_args = TensorDeviceType()
curobo_segmenter = RobotSegmenter.from_robot_file(
robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True
robot_file,
collision_sphere_buffer=0.01,
distance_threshold=0.05,
use_cuda_graph=True,
ops_dtype=torch.float16,
)
mesh_dataset, q_js = create_render_dataset(robot_file, write_pointcloud, n_frames=20)
mesh_dataset, q_js = create_render_dataset(robot_file, write_pointcloud, n_frames=30)
if save_debug_data:
visualize_scale = 10.0
@@ -179,9 +183,9 @@ def mask_image(robot_file="ur5e.yml"):
)
dt_list = []
for j in range(10):
for i in range(len(mesh_dataset)):
data = mesh_dataset[i]
cam_obs = CameraObservation(
depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000,
@@ -200,7 +204,11 @@ def mask_image(robot_file="ur5e.yml"):
torch.cuda.synchronize()
dt_list.append(time.time() - st_time)
print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:]))
print(
"Segmentation Time (ms), (hz)",
np.mean(dt_list[5:]) * 1000.0,
1.0 / np.mean(dt_list[5:]),
)
def batch_mask_image(robot_file="ur5e.yml"):
@@ -216,7 +224,11 @@ def batch_mask_image(robot_file="ur5e.yml"):
tensor_args = TensorDeviceType()
curobo_segmenter = RobotSegmenter.from_robot_file(
robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True
robot_file,
collision_sphere_buffer=0.01,
distance_threshold=0.05,
use_cuda_graph=True,
ops_dtype=torch.float16,
)
mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data, fov_deg=60)
@@ -489,7 +501,8 @@ def profile_mask_image(robot_file="ur5e.yml"):
if __name__ == "__main__":
mask_image()
robot_file = "quad_ur10e.yml" # "ur10e.yml" #"ur5e_robotiq_2f_140.yml"
# mask_image(robot_file)
# profile_mask_image()
# batch_mask_image()
batch_mask_image(robot_file)
# batch_robot_mask_image()

View File

@@ -23,8 +23,8 @@ license = NVIDIA
# Configure general project settings
long_description = file: README.md
long-description-content-type = text/markdown
license_file = LICENSE
long_description_content_type = text/markdown
license_files = LICENSE
# List of classifiers can be found here:
# https://pypi.org/classifiers/

View File

@@ -850,7 +850,12 @@ namespace Curobo
(1.333333333f) * g_acc[3] + (-0.083333333f) * g_acc[4]) * dt * dt +
// ( g_acc[3] + g_acc[1] - (2.0) * g_acc[2]) * dt * dt +
(-0.5f * g_jerk[0] + g_jerk[1] - g_jerk[3] + 0.5f * g_jerk[4]) * dt * dt * dt);
//(-0.5f * g_jerk[0] + g_jerk[1] - g_jerk[3] + 0.5f * g_jerk[4]) * dt * dt * dt);
(0.5f * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5f * g_jerk[4]) * dt * dt * dt);
//(0.500000000000000 * g_jerk[0] - 1 * g_jerk[1] + 0 * g_jerk[2] + 1 * g_jerk[3] - 0.500000000000000 * g_jerk[4]) * dt_inv_3;
}
else if (hid == horizon - 3)
{
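
The sign flip above is consistent with the adjoint of a symmetric five-point jerk stencil. A sketch of the check, assuming the forward pass computes jerk as ``0.5*q[i+2] - q[i+1] + q[i-1] - 0.5*q[i-2]`` times the cubed inverse timestep (the commented-out line references ``dt_inv_3``, so ``dt`` in this kernel appears to be the inverse step):

```python
import torch

q = torch.randn(9, requires_grad=True)
# assumed forward stencil: jerk[k] is centered at q index k + 2
jerk = 0.5 * q[4:] - q[3:-1] + q[1:-3] - 0.5 * q[:-4]
g_jerk = torch.randn_like(jerk)
jerk.backward(g_jerk)

# gradient w.r.t. q[4]; g_jerk[0..4] are the incoming jerk gradients at offsets -2..+2
manual = 0.5 * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5 * g_jerk[4]
print(torch.allclose(q.grad[4], manual))  # True: coefficients (+0.5, -1, 0, +1, -0.5)
```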

View File

@@ -29,7 +29,7 @@ from curobo.geom.sdf.world import (
from curobo.geom.types import Mesh, WorldConfig
from curobo.types.math import Pose
from curobo.util.logger import log_error, log_info, log_warn
from curobo.util.warp import init_warp
from curobo.util.warp import init_warp, warp_support_bvh_constructor_type
@dataclass(frozen=True)
@@ -149,6 +149,9 @@ class WorldMeshCollision(WorldPrimitiveCollision):
verts, faces = mesh.get_mesh_data()
v = wp.array(verts, dtype=wp.vec3, device=self._wp_device)
f = wp.array(np.ravel(faces), dtype=int, device=self._wp_device)
if warp_support_bvh_constructor_type():
new_mesh = wp.Mesh(points=v, indices=f, bvh_constructor="sah")
else:
new_mesh = wp.Mesh(points=v, indices=f)
return WarpMeshData(mesh.name, new_mesh.id, v, f, new_mesh)
@@ -188,6 +191,7 @@ class WorldMeshCollision(WorldPrimitiveCollision):
id_list[i] = m_data.m_id
name_list.append(m_data.name)
pose_buffer = Pose.from_batch_list(pose_list, self.tensor_args)
inv_pose_buffer = pose_buffer.inverse()
return name_list, id_list, inv_pose_buffer.get_pose_vector()

View File

@@ -20,7 +20,6 @@ import torch
import warp as wp
# CuRobo
from curobo.curobolib.kinematics import rotation_matrix_to_quaternion
from curobo.util.logger import log_error
from curobo.util.torch_utils import get_torch_jit_decorator
from curobo.util.warp import init_warp
@@ -174,31 +173,6 @@ def matrix_to_quaternion(
"""
matrix = matrix.view(-1, 3, 3)
out_quat = MatrixToQuaternion.apply(matrix, out_quat, adj_matrix)
# out_quat = cuda_matrix_to_quaternion(matrix)
return out_quat
def cuda_matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
"""Convert rotations given as rotation matrices to quaternions.
This is not differentiable. Use :func:`~matrix_to_quaternion` for differentiable conversion.
Args:
matrix: Rotation matrices as tensor of shape (..., 3, 3).
Returns:
quaternions with real part first, as tensor of shape (..., 4). [qw, qx,qy,qz]
"""
if matrix.size(-1) != 3 or matrix.size(-2) != 3:
raise ValueError(f"Invalid rotation matrix shape f{matrix.shape}.")
# account for different batch shapes here:
in_shape = matrix.shape
mat_in = matrix.view(-1, 3, 3)
out_quat = torch.zeros((mat_in.shape[0], 4), device=matrix.device, dtype=matrix.dtype)
out_quat = rotation_matrix_to_quaternion(matrix, out_quat)
out_shape = list(in_shape[:-2]) + [4]
out_quat = out_quat.view(out_shape)
return out_quat
@@ -521,6 +495,41 @@ def compute_batch_transform_point(
out_pt[b_idx * n_pts + p_idx] = p
@wp.kernel
def compute_batch_transform_point_fp16(
position: wp.array(dtype=wp.vec3h),
quat: wp.array(dtype=wp.vec4h),
pt: wp.array(dtype=wp.vec3h),
n_pts: wp.int32,
n_poses: wp.int32,
out_pt: wp.array(dtype=wp.vec3h),
):
"""A warp kernel to transform batch of points by batch of poses."""
# given n,3 points and b poses, get b,n,3 transformed points
# we tile as
tid = wp.tid()
b_idx = tid / (n_pts)
p_idx = tid - (b_idx * n_pts)
# read data:
in_position = position[b_idx]
in_quat = quat[b_idx]
in_pt = pt[b_idx * n_pts + p_idx]
# read point
# create a transform from a vector/quaternion:
q_vec = wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])
t = wp.transformh(in_position, q_vec)
# transform a point
p = wp.transform_point(t, in_pt)
# write pt:
out_pt[b_idx * n_pts + p_idx] = p
@wp.kernel
def compute_batch_pose_multipy(
position: wp.array(dtype=wp.vec3),
@@ -654,7 +663,8 @@ class TransformPoint(torch.autograd.Function):
b,
],
outputs=[wp.from_torch(out_points.view(-1, 3), dtype=wp.vec3)],
stream=wp.stream_from_torch(position.device),
stream=None if not position.is_cuda else wp.stream_from_torch(position.device),
device=wp.device_from_torch(position.device),
)
return out_points
@@ -679,7 +689,9 @@ class TransformPoint(torch.autograd.Function):
wp_adj_position = wp.from_torch(adj_position, dtype=wp.vec3)
wp_adj_quat = wp.from_torch(adj_quaternion, dtype=wp.vec4)
stream = None
if position.is_cuda:
stream = wp.stream_from_torch(position.device)
wp.launch(
kernel=compute_transform_point,
dim=ctx.b * ctx.n,
@@ -707,8 +719,9 @@ class TransformPoint(torch.autograd.Function):
adj_outputs=[
None,
],
stream=wp.stream_from_torch(grad_output.device),
adjoint=True,
stream=None if not grad_output.is_cuda else wp.stream_from_torch(grad_output.device),
device=wp.device_from_torch(grad_output.device),
)
g_p = g_q = g_pt = None
if ctx.needs_input_grad[0]:
@@ -742,19 +755,44 @@ class BatchTransformPoint(torch.autograd.Function):
)
ctx.b = b
ctx.n = n
if points.dtype == torch.float32:
wp.launch(
kernel=compute_batch_transform_point,
dim=b * n,
inputs=[
wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
wp.from_torch(
position.detach().view(-1, 3).contiguous(),
dtype=wp.types.vector(length=3, dtype=wp.float32),
),
wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
wp.from_torch(points.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
n,
b,
],
outputs=[wp.from_torch(out_points.view(-1, 3).contiguous(), dtype=wp.vec3)],
stream=wp.stream_from_torch(position.device),
stream=None if not position.is_cuda else wp.stream_from_torch(position.device),
device=wp.device_from_torch(position.device),
)
elif points.dtype == torch.float16:
wp.launch(
kernel=compute_batch_transform_point_fp16,
dim=b * n,
inputs=[
wp.from_torch(
position.detach().view(-1, 3).contiguous(),
dtype=wp.types.vector(length=3, dtype=wp.float16),
),
wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4h),
wp.from_torch(points.detach().view(-1, 3).contiguous(), dtype=wp.vec3h),
n,
b,
],
outputs=[wp.from_torch(out_points.view(-1, 3).contiguous(), dtype=wp.vec3h)],
stream=None if not position.is_cuda else wp.stream_from_torch(position.device),
device=wp.device_from_torch(position.device),
)
else:
log_error("Unsupported dtype: " + str(points.dtype))
return out_points
@@ -804,8 +842,9 @@ class BatchTransformPoint(torch.autograd.Function):
adj_outputs=[
None,
],
stream=wp.stream_from_torch(grad_output.device),
adjoint=True,
stream=None if not grad_output.is_cuda else wp.stream_from_torch(grad_output.device),
device=wp.device_from_torch(grad_output.device),
)
g_p = g_q = g_pt = None
if ctx.needs_input_grad[0]:
@@ -876,7 +915,8 @@ class BatchTransformPose(torch.autograd.Function):
wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
],
stream=wp.stream_from_torch(position.device),
stream=None if not position.is_cuda else wp.stream_from_torch(position.device),
device=wp.device_from_torch(position.device),
)
return out_position, out_quaternion
@@ -949,8 +989,13 @@ class BatchTransformPose(torch.autograd.Function):
None,
None,
],
stream=wp.stream_from_torch(grad_out_position.device),
adjoint=True,
stream=(
None
if not grad_out_position.is_cuda
else wp.stream_from_torch(grad_out_position.device)
),
device=wp.device_from_torch(grad_out_position.device),
)
g_p1 = g_q1 = g_p2 = g_q2 = None
if ctx.needs_input_grad[0]:
@@ -1010,7 +1055,7 @@ class TransformPose(torch.autograd.Function):
)
ctx.b = b
wp.launch(
kernel=compute_batch_pose_multipy,
kernel=compute_pose_multipy,
dim=b,
inputs=[
wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
@@ -1022,7 +1067,8 @@ class TransformPose(torch.autograd.Function):
wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
],
stream=wp.stream_from_torch(position.device),
stream=(None if not position.is_cuda else wp.stream_from_torch(position.device)),
device=wp.device_from_torch(position.device),
)
return out_position, out_quaternion
@@ -1095,7 +1141,12 @@ class TransformPose(torch.autograd.Function):
None,
None,
],
stream=wp.stream_from_torch(grad_out_position.device),
stream=(
None
if not grad_out_position.is_cuda
else wp.stream_from_torch(grad_out_position.device)
),
device=wp.device_from_torch(grad_out_position.device),
adjoint=True,
)
g_p1 = g_q1 = g_p2 = g_q2 = None
@@ -1155,7 +1206,8 @@ class PoseInverse(torch.autograd.Function):
wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3),
wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
],
stream=wp.stream_from_torch(position.device),
stream=(None if not position.is_cuda else wp.stream_from_torch(position.device)),
device=wp.device_from_torch(position.device),
)
return out_position, out_quaternion
@@ -1212,8 +1264,11 @@ class PoseInverse(torch.autograd.Function):
None,
None,
],
stream=wp.stream_from_torch(grad_out_position.device),
adjoint=True,
stream=(
None if not out_position.is_cuda else wp.stream_from_torch(out_position.device)
),
device=wp.device_from_torch(out_position.device),
)
g_p1 = g_q1 = None
if ctx.needs_input_grad[0]:
@@ -1260,7 +1315,8 @@ class QuatToMatrix(torch.autograd.Function):
outputs=[
wp.from_torch(out_mat.detach().view(-1, 3, 3).contiguous(), dtype=wp.mat33),
],
stream=wp.stream_from_torch(quaternion.device),
stream=(None if not quaternion.is_cuda else wp.stream_from_torch(quaternion.device)),
device=wp.device_from_torch(quaternion.device),
)
return out_mat
@@ -1299,8 +1355,11 @@ class QuatToMatrix(torch.autograd.Function):
adj_outputs=[
None,
],
stream=wp.stream_from_torch(grad_out_mat.device),
adjoint=True,
stream=(
None if not grad_out_mat.is_cuda else wp.stream_from_torch(grad_out_mat.device)
),
device=wp.device_from_torch(grad_out_mat.device),
)
g_q1 = None
if ctx.needs_input_grad[0]:
@@ -1345,7 +1404,8 @@ class MatrixToQuaternion(torch.autograd.Function):
outputs=[
wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
],
stream=wp.stream_from_torch(in_mat.device),
stream=(None if not in_mat.is_cuda else wp.stream_from_torch(in_mat.device)),
device=wp.device_from_torch(in_mat.device),
)
return out_quaternion
@@ -1382,8 +1442,9 @@ class MatrixToQuaternion(torch.autograd.Function):
adj_outputs=[
None,
],
stream=wp.stream_from_torch(grad_out_q.device),
adjoint=True,
stream=(None if not in_mat.is_cuda else wp.stream_from_torch(in_mat.device)),
device=wp.device_from_torch(in_mat.device),
)
g_q1 = None
if ctx.needs_input_grad[0]:
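
These launches passing ``device=wp.device_from_torch(...)`` and only requesting a CUDA stream when the tensors live on the GPU are what enable the changelog's CPU support for ``types.math.Pose``. A minimal sketch, assuming the usual ``Pose.from_list`` constructor and a CPU ``TensorDeviceType``:

```python
import torch

from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose

tensor_args = TensorDeviceType(device=torch.device("cpu"))
# x, y, z, qw, qx, qy, qz
pose = Pose.from_list([0.1, 0.0, 0.3, 1.0, 0.0, 0.0, 0.0], tensor_args)
inv = pose.inverse()  # runs the PoseInverse warp kernel on the CPU
print(inv.position, inv.quaternion)
```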

View File

@@ -117,6 +117,7 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig):
if self.history > self.d_opt:
log_info("LBFGS: history >= d_opt, reducing history to d_opt-1")
self.history = self.d_opt - 1
self.init_hessian(self.n_problems)
@profiler.record_function("lbfgs/reset")
def reset(self):

View File

@@ -642,7 +642,7 @@ def jit_diag_a_cov_update(w, actions, mean_action):
# weighted_delta =
# sum across horizon and mean across particles:
# cov_update = torch.diag(torch.mean(torch.sum(weighted_delta.T , dim=0), dim=0))
cov_update = torch.mean(torch.sum(weighted_delta, dim=-2), dim=-2).unsqueeze(-2)
cov_update = torch.mean(torch.sum(weighted_delta, dim=-3), dim=-2).unsqueeze(-2)
return cov_update

View File

@@ -9,5 +9,4 @@
# its affiliates is strictly prohibited.
#
"""
"""
""" """

View File

@@ -67,6 +67,21 @@ def warp_support_kernel_key(wp_module=None):
return True
def warp_support_bvh_constructor_type(wp_module=None):
if wp_module is None:
wp_module = wp
wp_version = wp_module.config.version
if version.parse(wp_version) < version.parse("1.6.0"):
log_info(
"Warp version is "
+ wp_version
+ " < 1.6.0, using, creating global constant to trigger kernel generation."
)
return False
return True
def is_runtime_warp_kernel_enabled() -> bool:
env_variable = os.environ.get("CUROBO_WARP_RUNTIME_KERNEL_DISABLE")
if env_variable is None:

View File

@@ -25,7 +25,6 @@ from curobo.types.robot import RobotConfig
from curobo.types.state import JointState
from curobo.util.logger import log_error
from curobo.util.torch_utils import (
get_torch_compile_options,
get_torch_jit_decorator,
is_torch_compile_available,
)
@@ -63,6 +62,8 @@ class RobotSegmenter:
distance_threshold: float = 0.05,
use_cuda_graph: bool = True,
tensor_args: TensorDeviceType = TensorDeviceType(),
ops_dtype: torch.dtype = torch.float32,
depth_to_meter: float = 0.001,
):
robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
if collision_sphere_buffer is not None:
@@ -79,7 +80,11 @@ class RobotSegmenter:
robot_world = RobotWorld(config)
return RobotSegmenter(
robot_world, distance_threshold=distance_threshold, use_cuda_graph=use_cuda_graph
robot_world,
distance_threshold=distance_threshold,
use_cuda_graph=use_cuda_graph,
ops_dtype=ops_dtype,
depth_to_meter=depth_to_meter,
)
def get_pointcloud_from_depth(self, camera_obs: CameraObservation):
@@ -171,18 +176,23 @@ class RobotSegmenter:
robot_spheres = self._robot_world.get_kinematics(q).link_spheres_tensor
points = self.get_pointcloud_from_depth(camera_obs)
camera_to_robot = camera_obs.pose
points = points.to(dtype=torch.float32)
camera_to_robot = camera_obs.pose.to(TensorDeviceType(dtype=self._ops_dtype))
if self._out_points_buffer is None:
self._out_points_buffer = points.clone()
if self._out_gpt is None:
self._out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=points.device)
self._out_gpt = torch.zeros(
(points.shape[0], points.shape[1], 3), device=points.device, dtype=self._ops_dtype
)
if self._out_gp is None:
self._out_gp = torch.zeros((camera_to_robot.position.shape[0], 3), device=points.device)
self._out_gp = torch.zeros(
(camera_to_robot.position.shape[0], 3), device=points.device, dtype=self._ops_dtype
)
if self._out_gq is None:
self._out_gq = torch.zeros(
(camera_to_robot.quaternion.shape[0], 4), device=points.device
(camera_to_robot.quaternion.shape[0], 4),
device=points.device,
dtype=self._ops_dtype,
)
points_in_robot_frame = camera_to_robot.batch_transform_points(
@@ -193,10 +203,20 @@ class RobotSegmenter:
gpt_out=self._out_gpt,
)
out_points = points_in_robot_frame
robot_spheres = robot_spheres.to(dtype=self._ops_dtype)
if is_torch_compile_available():
mask, filtered_image = mask_spheres_image(
camera_obs.depth_image, robot_spheres, out_points, self.distance_threshold
camera_obs.depth_image,
robot_spheres,
points_in_robot_frame,
self.distance_threshold,
)
else:
mask, filtered_image = mask_spheres_image_cdist(
camera_obs.depth_image,
robot_spheres,
points_in_robot_frame,
self.distance_threshold,
)
return mask, filtered_image
@@ -246,8 +266,11 @@ def mask_spheres_image(
robot_radius = robot_spheres[..., 3]
points = points.unsqueeze(-2)
robot_points = robot_spheres[..., :3]
sph_distance = -1 * (
torch.linalg.norm(points - robot_spheres[..., :3], dim=-1) - robot_radius
torch.linalg.norm(points - robot_points, dim=-1) - robot_radius
) # b, n_spheres
distance = torch.max(sph_distance, dim=-1)[0]
@@ -259,3 +282,39 @@ def mask_spheres_image(
mask = torch.logical_and((image > 0.0), (distance > -distance_threshold))
filtered_image = torch.where(mask, 0, image)
return mask, filtered_image
@get_torch_jit_decorator()
def mask_spheres_image_cdist(
image: torch.Tensor,
link_spheres_tensor: torch.Tensor,
world_points: torch.Tensor,
distance_threshold: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
if link_spheres_tensor.shape[0] != 1:
assert link_spheres_tensor.shape[0] == world_points.shape[0]
if len(world_points.shape) == 2:
world_points = world_points.unsqueeze(0)
robot_spheres = link_spheres_tensor.view(link_spheres_tensor.shape[0], -1, 4).contiguous()
robot_spheres = robot_spheres.expand(world_points.shape[0], -1, -1)
robot_points = robot_spheres[..., :3]
robot_radius = robot_spheres[..., 3].unsqueeze(1)
sph_distance = torch.cdist(world_points, robot_points, p=2.0)
sph_distance = -1.0 * (sph_distance - robot_radius)
distance = torch.max(sph_distance, dim=-1)[0]
distance = distance.view(
image.shape[0],
image.shape[1],
image.shape[2],
)
distance_mask = distance > -distance_threshold
mask = torch.logical_and((image > 0.0), distance_mask)
filtered_image = torch.where(mask, 0, image)
return mask, filtered_image
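
On the changelog note that the ``cdist`` fallback requires half the memory: the broadcast formulation above materializes a ``(batch, n_points, n_spheres, 3)`` difference tensor before taking the norm, while ``torch.cdist`` produces the ``(batch, n_points, n_spheres)`` distance matrix directly. A rough sketch with synthetic sizes (not taken from the repo):

```python
import torch

B, N, S = 1, 480 * 640, 64      # batch, depth pixels, robot spheres
points = torch.randn(B, N, 3)
spheres = torch.randn(B, S, 4)  # x, y, z, radius

# broadcast path (as in mask_spheres_image): builds a (B, N, S, 3) intermediate
d_broadcast = torch.linalg.norm(points.unsqueeze(-2) - spheres[..., :3].unsqueeze(1), dim=-1)

# cdist path (as in mask_spheres_image_cdist): goes straight to (B, N, S)
d_cdist = torch.cdist(points, spheres[..., :3], p=2.0)

print(d_broadcast.shape, d_cdist.shape)  # both torch.Size([1, 307200, 64])
```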