From b1f63e8778b9954ed1296f249aceb202ab05e10b Mon Sep 17 00:00:00 2001 From: Balakumar Sundaralingam Date: Mon, 18 Mar 2024 11:19:48 -0700 Subject: [PATCH] Significantly improved convergence for mesh and cuboid, new ESDF collision. --- CHANGELOG.md | 55 +- benchmark/README.md | 47 +- benchmark/curobo_benchmark.py | 121 +- benchmark/curobo_profile.py | 10 +- benchmark/curobo_voxel_benchmark.py | 660 ++++ benchmark/curobo_voxel_profile.py | 305 ++ benchmark/ik_benchmark.py | 9 +- docker/aarch64.dockerfile | 4 +- docker/user.dockerfile | 4 +- examples/ik_example.py | 16 +- examples/isaac_sim/helper.py | 5 +- examples/kinematics_example.py | 4 +- examples/mpc_example.py | 4 +- examples/robot_image_segmentation_example.py | 295 +- setup.cfg | 4 +- setup.py | 28 +- src/curobo/__init__.py | 6 +- .../robot/franka_description/franka_test.urdf | 1 + src/curobo/content/configs/robot/franka.yml | 2 +- src/curobo/content/configs/robot/iiwa.yml | 8 +- .../content/configs/robot/iiwa_allegro.yml | 19 + .../configs/robot/spheres/franka_mesh.yml | 8 +- .../content/configs/robot/spheres/ur10e.yml | 2 +- src/curobo/content/configs/robot/ur10e.yml | 2 +- src/curobo/content/configs/robot/ur5e.yml | 3 +- .../content/configs/task/finetune_trajopt.yml | 12 +- .../content/configs/task/gradient_ik.yml | 8 +- .../content/configs/task/gradient_trajopt.yml | 12 +- src/curobo/content/configs/task/graph.yml | 2 +- .../content/configs/task/particle_ik.yml | 4 +- .../content/configs/task/particle_mpc.yml | 2 +- .../content/configs/task/particle_trajopt.yml | 10 +- .../cuda_robot_model/cuda_robot_model.py | 2 +- src/curobo/cuda_robot_model/types.py | 16 +- .../urdf_kinematics_parser.py | 6 +- src/curobo/curobolib/cpp/geom_cuda.cpp | 102 +- src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp | 89 +- src/curobo/curobolib/cpp/lbfgs_step_kernel.cu | 1157 ++++--- .../curobolib/cpp/line_search_kernel.cu | 1 - .../curobolib/cpp/self_collision_kernel.cu | 18 +- src/curobo/curobolib/cpp/sphere_obb_kernel.cu | 2761 +++++++++++++---- .../curobolib/cpp/tensor_step_kernel.cu | 2 - src/curobo/curobolib/geom.py | 215 +- src/curobo/curobolib/opt.py | 36 +- src/curobo/geom/cv.py | 20 +- src/curobo/geom/sdf/sdf_grid.py | 6 +- src/curobo/geom/sdf/utils.py | 7 +- src/curobo/geom/sdf/warp_primitives.py | 787 ++--- src/curobo/geom/sdf/world.py | 259 +- src/curobo/geom/sdf/world_blox.py | 14 +- src/curobo/geom/sdf/world_mesh.py | 28 +- src/curobo/geom/sdf/world_voxel.py | 699 +++++ src/curobo/geom/transform.py | 19 +- src/curobo/geom/types.py | 105 +- src/curobo/graph/graph_base.py | 17 +- src/curobo/opt/newton/lbfgs.py | 74 +- src/curobo/opt/newton/newton_base.py | 25 +- src/curobo/opt/particle/parallel_es.py | 72 +- src/curobo/opt/particle/parallel_mppi.py | 156 +- src/curobo/opt/particle/particle_opt_base.py | 2 +- src/curobo/opt/particle/particle_opt_utils.py | 5 +- src/curobo/rollout/arm_base.py | 20 +- src/curobo/rollout/arm_reacher.py | 28 +- src/curobo/rollout/cost/bound_cost.py | 5 +- src/curobo/rollout/cost/dist_cost.py | 13 +- .../rollout/cost/primitive_collision_cost.py | 60 +- src/curobo/rollout/cost/stop_cost.py | 3 +- src/curobo/rollout/cost/straight_line_cost.py | 5 +- src/curobo/rollout/cost/zero_cost.py | 11 +- .../dynamics_model/integration_utils.py | 29 +- .../rollout/dynamics_model/tensor_step.py | 3 +- src/curobo/rollout/rollout_base.py | 9 +- src/curobo/types/base.py | 8 +- src/curobo/types/camera.py | 93 +- src/curobo/types/math.py | 24 +- src/curobo/types/state.py | 125 +- src/curobo/types/tensor.py | 62 +- 
src/curobo/util/sample_lib.py | 3 +- src/curobo/util/tensor_util.py | 15 +- src/curobo/util/torch_utils.py | 115 +- src/curobo/util/trajectory.py | 49 +- src/curobo/util_file.py | 1 + src/curobo/wrap/model/robot_segmenter.py | 60 +- src/curobo/wrap/model/robot_world.py | 58 +- src/curobo/wrap/reacher/evaluator.py | 14 +- src/curobo/wrap/reacher/ik_solver.py | 5 +- src/curobo/wrap/reacher/motion_gen.py | 16 +- src/curobo/wrap/reacher/trajopt.py | 226 +- tests/cost_test.py | 2 +- tests/geom_test.py | 18 +- tests/interpolation_test.py | 10 +- tests/kinematics_test.py | 30 +- tests/motion_gen_module_test.py | 2 +- tests/nvblox_test.py | 25 +- tests/pose_reaching_test.py | 4 +- tests/robot_segmentation_test.py | 209 ++ tests/voxel_collision_test.py | 254 ++ tests/voxelization_test.py | 181 ++ tests/warp_mesh_test.py | 6 +- tests/world_config_test.py | 3 +- 100 files changed, 7587 insertions(+), 2589 deletions(-) create mode 100644 benchmark/curobo_voxel_benchmark.py create mode 100644 benchmark/curobo_voxel_profile.py create mode 100644 src/curobo/geom/sdf/world_voxel.py create mode 100644 tests/robot_segmentation_test.py create mode 100644 tests/voxel_collision_test.py create mode 100644 tests/voxelization_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f9e37da..efa27e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ its affiliates is strictly prohibited. --> # Changelog -## Version 0.6.3 +## Version 0.7.0 ### Changes in default behavior - Increased default collision cache to 50 in RobotWorld. - Changed `CSpaceConfig.position_limit_clip` default to 0 as previous default of 0.01 can make @@ -18,6 +18,11 @@ default start state in examples be out of bounds. - MotionGen uses parallel_finetune by default. To get previous motion gen behavior, pass `warmup(parallel_finetune=False)` and `MotionGenPlanConfig(parallel_finetune=False)`. - MotionGen loads Mesh Collision checker instead of Primitive by default. +- UR10e and UR5e no longer have a collision sphere at the tool frame for world collision checking. This +sphere is only active for self collision avoidance. +- With torch>=2.0, cuRobo will use `torch.compile` instead of `torch.jit.script` to generate fused +kernels. This can take several seconds during the first run. To enable this feature, set the +environment variable `export CUROBO_TORCH_COMPILE_DISABLE=0`. ### Breaking Changes - Renamed `copy_if_not_none` to `clone_if_not_none` to be more descriptive. Now `copy_if_not_none` @@ -26,6 +31,9 @@ will try to copy data into reference. parallel problems in optimization. - Added more inputs to pose distance kernels. Check `curobolib/geom.py`. - Pose cost `run_vec_weight` should now be `[0,0,0,0,0,0]` instead of `[1,1,1,1,1,1]` +- ``max_distance`` is now a tensor instead of a ``float`` and is an input to collision kernels. +- Order of inputs to ``SweptSdfMeshWarpPy`` has changed. + ### New Features - Add function to disable and enable collision for specific links in KinematicsTensorConfig. @@ -37,6 +45,19 @@ calling plan_single after warmup of goalset. add error when `velocity_scale<0.1`. - Add experimental robot image segmentation module to enable robot removal in depth images. - Add constrained planning mode to motion_gen. +- Use `torch.compile` to leverage better kernel fusion in place of `torch.jit.script`. +- Significantly improved collision computation for cuboids and meshes. The mesh collision checker is +now only 2x slower than the cuboid checker (down from 5x slower). Optimization convergence is also improved.
+- LBFGS kernels now support ``history <= 31``, up from ``history <= 15``. +- 2x faster LBFGS kernel that allocates up to 68kb of shared memory, preventing use on CUDA devices +with compute capability ``<7.0``. +- On the benchmark dataset, planning time is now 42ms on average, down from 50ms. Higher quality solutions +are also obtained. See [benchmarks](https://curobo.org/source/getting_started/4_benchmarks.html) for more details. +- Add ``WorldCollisionVoxel``, a new collision checking implementation that uses a voxel grid +of signed distances (SDF) to compute collision avoidance metrics. Documentation coming soon, see +``benchmark/curobo_voxel_benchmark.py`` for an example. +- Add API for ESDF computation from world representations, see +``WorldCollision.get_esdf_in_bounding_box()``. ### BugFixes & Misc. - refactored wp.index() instances to `[]` to avoid errors in future releases of warp. @@ -47,7 +68,7 @@ BATCH_GOALSET. - Added package data to also export `.so` files. - Fixed bug in transforming link visual mesh offset when reading from urdf. - Fixed bug in MotionGenPlanConfig.clone() that didn't clone the state of parallel_finetune. -- Increased weighting from 1.0 to 5.0 for optimized_dt in TrajEvaluator to select shorter +- Increased weighting from 1.0 to 10.0 for optimized_dt in TrajEvaluator to select shorter trajectories. - Improved determinism by setting global seed for random in `graph_nx.py`. - Added option to clear obstacles in WorldPrimitiveCollision. @@ -57,10 +78,24 @@ is enabled. graph. - plan_goalset will pad for extra goals when called with fewer goals than at initial creation. - Improved API documentation for Optimizer class. -- Improved benchmark timings, now within 15ms of results reported in technical report. Added -numbers to benchmark [webpage](https://curobo.org/source/getting_started/4_benchmarks.html) for -easy reference. - Set `use_cuda_graph` to `True` as default from `None` in `MotionGenConfig.load_from_robot_config` +- Add batched mode to robot image segmentation, supporting a single robot with multiple cameras and +batched robots with batched cameras. +- Add `log_warn` import to `arm_reacher.py`. +- Remove negative radius check in self collision kernel to allow for self collision checking with +spheres of negative radius. +- Added `conftest.py` to disable `torch.compile` for tests. +- Added UR5e robot with robotiq gripper (2f-140) with improved sphere model. +- Fix bug in aarch64.dockerfile where curobo was cloned to the wrong path. +- Fix bug in aarch64.dockerfile where python was used instead of python3. +- Remove unused variables in kernels. +- Added ``pybind11`` as a dependency as some pytorch dockers for Jetson do not have this installed. +- Fix incorrect dimensions in ``MotionGenResult.success`` in ``MotionGen.plan_batch()`` when +trajectory optimization fails. +- Added unit tests for collision checking functions. +- Fix bug in linear interpolation which was not reading the new ``optimized_dt`` to interpolate +velocity, acceleration, and jerk. + ### Known Bugs (WIP) - Examples don't run in Isaac Sim 2023.1.1 due to behavior change in urdf importer. @@ -104,11 +139,11 @@ fails. ### Performance Regressions - cuRobo now generates significantly shorter paths than the previous version. E.g., cuRobo obtains -2.2 seconds 98th percentile motion time on the 2600 problems (`benchmark/curobo_benchmark.py`), where -previously it was at 3 seconds (1.36x quicker motions). This was obtained by retuning the weights and -slight reformulations of trajectory optimization.
These changes have led to a slight degrade in -planning time, 20ms slower on 4090 and 40ms on ORIN MAXN. We will address this slow down in a later -release. One way to avoid this regression is to set `finetune_dt_scale=1.05` in +2.2 seconds 98th percentile motion time on the 2600 problems (`benchmark/curobo_benchmark.py`), +where previously it was at 3 seconds (1.36x quicker motions). This was obtained by retuning the +weights and slight reformulations of trajectory optimization. These changes have led to a slight +degrade in planning time, 20ms slower on 4090 and 40ms on ORIN MAXN. We will address this slow down +in a later release. One way to avoid this regression is to set `finetune_dt_scale=1.05` in `MotionGenConfig.load_from_robot_config()`. diff --git a/benchmark/README.md b/benchmark/README.md index 77c4937..ce59462 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -10,49 +10,4 @@ its affiliates is strictly prohibited. --> This folder contains scripts to run the motion planning benchmarks. -Refer to Benchmarks & Profiling instructions: https://curobo.org/source/getting_started/4_benchmarks.html. - -Results in the arxiv paper were obtained from v0.6.0. - -v0.6.2+ has significant changes to improve motion quality with lower motion time, lower path length, higher pose accuracy (<1mm). v0.6.2+ sacrifices 15ms (RTX 4090) of compute time to achieve signficantly better solutions. The new results are yet to be added to the technical report. For now, to get the latest benchmark results follow instructions here: https://curobo.org/source/getting_started/4_benchmarks.html. - -To get results similar to in the technical report, pass `--report_edition` to `curobo_benchmark.py`. - - -# Latest Results (Feb 2024) - -Motion Generation on 2600 problems from motion benchmaker and motion policy networks, gives the -following values on a RTX 4090: - -| Metric | Value | -|-------------------|--------------------------------------------------------------| -|Success % | 99.84 | -|Plan Time (s) | mean: 0.068 ± 0.158 median:0.042 75%: 0.055 98%: 0.246 | -|Motion Time (s) | mean: 1.169 ± 0.360 median:1.140 75%: 1.381 98%: 2.163 | -|Path Length (rad.) | mean: 3.177 ± 1.072 median:3.261 75%: 3.804 98%: 5.376 | -|Jerk | mean: 97.700 ± 48.630 median:88.993 75%: 126.092 98%: 199.540| -|Position Error (mm)| mean: 0.119 ± 0.341 median:0.027 75%: 0.091 98%: 1.039 | - - -## Motion Benchmaker (800 problems): - -| Metric | Value | -|-------------------|--------------------------------------------------------------| -|Success % | 100 | -|Plan Time (s) | mean: 0.063 ± 0.137 median:0.042 75%: 0.044 98%: 0.206 | -|Motion Time (s) | mean: 1.429 ± 0.330 median:1.392 75%: 1.501 98%: 2.473 | -|Path Length (rad.) | mean: 3.956 ± 0.783 median:3.755 75%: 4.352 98%: 6.041 | -|Jerk | mean: 67.284 ± 27.788 median:61.853 75%: 83.337 98%: 143.118 | -|Position Error (mm)| mean: 0.079 ± 0.139 median:0.032 75%: 0.077 98%: 0.472 | - - -## Motion Policy Networks (1800 problems): - -| Metric | Value | -|-------------------|-----------------------------------------------------------------| -|Success % | 99.77 | -|Plan Time (s) | mean: 0.068 ± 0.117 median:0.042 75%: 0.059 98%: 0.243 | -|Motion Time (s) | mean: 1.051 ± 0.306 median:1.016 75%: 1.226 98%: 1.760 | -|Path Length (rad.) 
| mean: 2.829 ± 1.000 median:2.837 75%: 3.482 98%: 4.905 | -|Jerk | mean: 110.610 ± 47.586 median:105.271 75%: 141.517 98%: 217.158 | -|Position Error (mm)| mean: 0.122 ± 0.343 median:0.024 75%: 0.095 98%: 1.114 | +Refer to the Benchmarks & Profiling page for the latest results: https://curobo.org/source/getting_started/4_benchmarks.html. \ No newline at end of file diff --git a/benchmark/curobo_benchmark.py b/benchmark/curobo_benchmark.py index 944282b..74cd07f 100644 --- a/benchmark/curobo_benchmark.py +++ b/benchmark/curobo_benchmark.py @@ -45,8 +45,8 @@ torch.manual_seed(2) torch.backends.cudnn.benchmark = True -torch.backends.cuda.matmul.allow_tf32 = False -torch.backends.cudnn.allow_tf32 = False +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True np.random.seed(2) random.seed(2) @@ -144,6 +144,7 @@ def load_curobo( collision_activation_distance: float = 0.02, args=None, parallel_finetune=False, + ik_seeds=None, ): robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] robot_cfg["kinematics"]["collision_sphere_buffer"] = collision_buffer @@ -152,12 +153,14 @@ def load_curobo( robot_cfg["kinematics"]["ee_link"] = "panda_hand" # del robot_cfg["kinematics"] + if ik_seeds is None: + ik_seeds = 24 - ik_seeds = 30 # 500 if graph_mode: trajopt_seeds = 4 - if trajopt_seeds >= 14: - ik_seeds = max(100, trajopt_seeds * 2) + collision_activation_distance = 0.0 + if trajopt_seeds >= 16: + ik_seeds = 100 if mpinets: robot_cfg["kinematics"]["lock_joints"] = { "panda_finger_joint1": 0.025, @@ -181,11 +184,11 @@ def load_curobo( K = robot_cfg_instance.kinematics.kinematics_config.joint_limits K.position[0, :] -= 0.2 K.position[1, :] += 0.2 - finetune_iters = 300 + finetune_iters = 200 grad_iters = None if args.report_edition: finetune_iters = 200 - grad_iters = 125 + grad_iters = 100 motion_gen_config = MotionGenConfig.load_from_robot_config( robot_cfg_instance, world_cfg, @@ -207,10 +210,11 @@ def load_curobo( collision_activation_distance=collision_activation_distance, trajopt_dt=0.25, finetune_dt_scale=finetune_dt_scale, - maximum_trajectory_dt=0.16, + maximum_trajectory_dt=0.15, ) mg = MotionGen(motion_gen_config) mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=parallel_finetune) + return mg, robot_cfg @@ -227,24 +231,20 @@ def benchmark_mb( # load dataset: force_graph = False - interpolation_dt = 0.02 - # mpinets_data = True - # if mpinets_data: file_paths = [motion_benchmaker_raw, mpinets_raw][:] + file_paths = [motion_benchmaker_raw, mpinets_raw] if args.demo: file_paths = [demo_raw] - # else:22 - # file_paths = [get_mb_dataset_path()][:1] enable_debug = save_log or plot_cost all_files = [] og_tsteps = 32 if override_tsteps is not None: og_tsteps = override_tsteps og_finetune_dt_scale = 0.9 - og_trajopt_seeds = 12 - og_parallel_finetune = not args.jetson + og_trajopt_seeds = 4 + og_parallel_finetune = True og_collision_activation_distance = 0.01 + og_ik_seeds = None for file_path in file_paths: all_groups = [] mpinets_data = False @@ -258,50 +258,25 @@ def benchmark_mb( trajopt_seeds = og_trajopt_seeds collision_activation_distance = og_collision_activation_distance parallel_finetune = og_parallel_finetune + ik_seeds = og_ik_seeds + if "cage_panda" in key: + trajopt_seeds = 8 + + if "table_under_pick_panda" in key: + trajopt_seeds = 8 + finetune_dt_scale = 0.95 + + if key == "cubby_task_oriented": # or key == "merged_cubby_task_oriented": trajopt_seeds = 16 finetune_dt_scale = 0.95 - parallel_finetune = True - 
if "table_under_pick_panda" in key: - tsteps = 40 - trajopt_seeds = 24 - finetune_dt_scale = 0.95 - parallel_finetune = True - if "table_pick_panda" in key: - collision_activation_distance = 0.005 - - if "cubby_task_oriented" in key and "merged" not in key: - trajopt_seeds = 24 - finetune_dt_scale = 0.95 - collision_activation_distance = 0.015 - parallel_finetune = True if "dresser_task_oriented" in key: - trajopt_seeds = 24 - # tsteps = 40 + trajopt_seeds = 16 finetune_dt_scale = 0.95 - collision_activation_distance = 0.01 - parallel_finetune = True - if key in [ - "tabletop_neutral_start", - "merged_cubby_neutral_start", - "merged_cubby_task_oriented", - "cubby_neutral_start", - "cubby_neutral_goal", - "dresser_neutral_start", - "dresser_neutral_goal", - "tabletop_task_oriented", - ]: - collision_activation_distance = 0.005 - if key in ["dresser_neutral_goal"]: - trajopt_seeds = 24 # 24 - # tsteps = 36 - collision_activation_distance = 0.01 - # trajopt_seeds = 12 - scene_problems = problems[key] # [:10] + + scene_problems = problems[key][:] n_cubes = check_problems(scene_problems) - # print(n_cubes) - # continue mg, robot_cfg = load_curobo( n_cubes, enable_debug, @@ -316,6 +291,7 @@ def benchmark_mb( collision_activation_distance=collision_activation_distance, args=args, parallel_finetune=parallel_finetune, + ik_seeds=ik_seeds, ) m_list = [] i = 0 @@ -326,10 +302,10 @@ def benchmark_mb( continue plan_config = MotionGenPlanConfig( - max_attempts=100, + max_attempts=20, enable_graph_attempt=1, - disable_graph_attempt=20, - enable_finetune_trajopt=True, + disable_graph_attempt=10, + enable_finetune_trajopt=not args.no_finetune, partial_ik_opt=False, enable_graph=graph_mode or force_graph, timeout=60, @@ -344,9 +320,11 @@ def benchmark_mb( problem_name = "d_" + key + "_" + str(i) # reset planner - mg.reset(reset_seed=True) + mg.reset(reset_seed=False) if args.mesh: - world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world() + world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world( + merge_meshes=False + ) else: world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world() mg.world_coll_checker.clear_cache() @@ -355,7 +333,13 @@ def benchmark_mb( # run planner start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) goal_pose = Pose.from_list(pose) - + if i == 1: + for _ in range(3): + result = mg.plan_single( + start_state, + goal_pose, + plan_config.clone(), + ) result = mg.plan_single( start_state, goal_pose, @@ -389,11 +373,9 @@ def benchmark_mb( dt = result.optimized_dt.item() if "trajopt_result" in result.debug_info: success = result.success.item() - traj_cost = ( - # result.debug_info["trajopt_result"].debug_info["solver"]["cost"][0] - result.debug_info["trajopt_result"].debug_info["solver"]["cost"][-1] - ) - # print(traj_cost[0]) + traj_cost = result.debug_info["trajopt_result"].debug_info["solver"][ + "cost" + ][-1] traj_cost = torch.cat(traj_cost, dim=-1) plot_cost_iteration( traj_cost, @@ -438,7 +420,7 @@ def benchmark_mb( .squeeze() .numpy() .tolist(), - "dt": interpolation_dt, + "dt": result.interpolation_dt, } debug = { @@ -728,11 +710,18 @@ if __name__ == "__main__": help="When True, runs benchmark with parameters for jetson", default=False, ) + parser.add_argument( + "--no_finetune", + action="store_true", + help="When True, runs benchmark with parameters for jetson", + default=False, + ) args = parser.parse_args() setup_curobo_logger("error") - for _ in range(1): + for i in range(1): + 
print("*****RUN: " + str(i)) benchmark_mb( save_log=False, write_usd=args.save_usd, diff --git a/benchmark/curobo_profile.py b/benchmark/curobo_profile.py index 77cbcb2..d6d98a7 100644 --- a/benchmark/curobo_profile.py +++ b/benchmark/curobo_profile.py @@ -75,12 +75,13 @@ def load_curobo( num_ik_seeds=30, num_trajopt_seeds=12, interpolation_dt=0.025, + finetune_trajopt_iters=200, # grad_trajopt_iters=200, store_ik_debug=enable_log, store_trajopt_debug=enable_log, ) mg = MotionGen(motion_gen_config) - mg.warmup(enable_graph=False) + mg.warmup(enable_graph=False, warmup_js_trajopt=False) return mg @@ -140,14 +141,17 @@ def benchmark_mb(args): print(result.total_time, result.solve_time) # continue # load obstacles - + # exit() # run planner with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + # torch.cuda.profiler.start() result = mg.plan_single( start_state, Pose.from_list(pose), plan_config, ) + # torch.cuda.profiler.stop() + print("Exporting the trace..") prof.export_chrome_trace(join_path(args.save_path, args.file_name) + ".json") print(result.success, result.status) @@ -191,5 +195,5 @@ if __name__ == "__main__": args = parser.parse_args() - setup_curobo_logger("error") + setup_curobo_logger("warn") benchmark_mb(args) diff --git a/benchmark/curobo_voxel_benchmark.py b/benchmark/curobo_voxel_benchmark.py new file mode 100644 index 0000000..bfccc0e --- /dev/null +++ b/benchmark/curobo_voxel_benchmark.py @@ -0,0 +1,660 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Standard Library +import argparse +from copy import deepcopy +from typing import Optional + +# Third Party +import matplotlib.pyplot as plt +import numpy as np +import torch +from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Cuboid +from curobo.geom.types import Cuboid as curobo_Cuboid +from curobo.geom.types import Mesh, VoxelGrid +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +np.random.seed(0) + + +def plot_cost_iteration(cost: torch.Tensor, save_path="cost", title="", log_scale=False): + fig = plt.figure(figsize=(5, 4)) + cost = cost.cpu().numpy() + # save to csv: + np.savetxt(save_path + ".csv", cost, delimiter=",") + + # if cost.shape[0] > 1: + colormap = plt.cm.winter + plt.gca().set_prop_cycle(plt.cycler("color", colormap(np.linspace(0, 1, cost.shape[0])))) + x = [i for i in range(cost.shape[-1])] + for i in range(cost.shape[0]): + plt.plot(x, cost[i], label="seed_" + str(i)) + plt.tight_layout() + # plt.title(title) + plt.xlabel("iteration") + plt.ylabel("cost") + if log_scale: + plt.yscale("log") + plt.grid() + # plt.legend() + plt.tight_layout() + plt.savefig(save_path + ".pdf") + plt.close() + + +def plot_traj(act_seq: JointState, dt=0.25, title="", save_path="plot.png", sma_filter=False): + fig, ax = plt.subplots(4, 1, figsize=(5, 8), sharex=True) + t_steps = np.linspace(0, act_seq.position.shape[0] * dt, act_seq.position.shape[0]) + + if sma_filter: + kernel = 5 + sma = torch.nn.AvgPool1d(kernel_size=kernel, stride=1, padding=2, ceil_mode=False).cuda() + + for i in range(act_seq.position.shape[-1]): + ax[0].plot(t_steps, act_seq.position[:, i].cpu(), "-", label=str(i)) + + ax[1].plot(t_steps[: act_seq.velocity.shape[0]], act_seq.velocity[:, i].cpu(), "-") + if sma_filter: + act_seq.acceleration[:, i] = sma( + act_seq.acceleration[:, i].view(1, -1) + ).squeeze() # @[1:-2] + + ax[2].plot(t_steps[: act_seq.acceleration.shape[0]], act_seq.acceleration[:, i].cpu(), "-") + if sma_filter: + act_seq.jerk[:, i] = sma(act_seq.jerk[:, i].view(1, -1)).squeeze() # @[1:-2]\ + + ax[3].plot(t_steps[: act_seq.jerk.shape[0]], act_seq.jerk[:, i].cpu(), "-") + ax[0].set_title(title + " dt=" + "{:.3f}".format(dt)) + ax[3].set_xlabel("Time(s)") + ax[3].set_ylabel("Jerk rad. s$^{-3}$") + ax[0].set_ylabel("Position rad.") + ax[1].set_ylabel("Velocity rad. s$^{-1}$") + ax[2].set_ylabel("Acceleration rad. 
s$^{-2}$") + ax[0].grid() + ax[1].grid() + ax[2].grid() + ax[3].grid() + ax[0].legend(bbox_to_anchor=(0.5, 1.6), loc="upper center", ncol=4) + plt.tight_layout() + plt.savefig(save_path) + plt.close() + + +def load_curobo( + n_cubes: int, + enable_debug: bool = False, + tsteps: int = 30, + trajopt_seeds: int = 4, + mpinets: bool = False, + graph_mode: bool = False, + cuda_graph: bool = True, + collision_activation_distance: float = 0.025, + finetune_dt_scale: float = 1.0, + parallel_finetune: bool = True, + args=None, +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.00 + + ik_seeds = 24 + if graph_mode: + trajopt_seeds = 4 + if trajopt_seeds >= 14: + ik_seeds = max(100, trajopt_seeds * 2) + if mpinets: + robot_cfg["kinematics"]["lock_joints"] = { + "panda_finger_joint1": 0.025, + "panda_finger_joint2": 0.025, + } + world_cfg = WorldConfig.from_dict( + { + "voxel": { + "base": { + "dims": [2.4, 2.4, 2.4], + "pose": [0, 0, 0, 1, 0, 0, 0], + "voxel_size": 0.005, + "feature_dtype": torch.bfloat16, + }, + } + } + ) + interpolation_steps = 2000 + if graph_mode: + interpolation_steps = 100 + robot_cfg_instance = RobotConfig.from_dict(robot_cfg, tensor_args=TensorDeviceType()) + K = robot_cfg_instance.kinematics.kinematics_config.joint_limits + K.position[0, :] -= 0.2 + K.position[1, :] += 0.2 + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg_instance, + world_cfg, + trajopt_tsteps=tsteps, + collision_checker_type=CollisionCheckerType.VOXEL, + use_cuda_graph=cuda_graph, + position_threshold=0.005, # 0.5 cm + rotation_threshold=0.05, + num_ik_seeds=ik_seeds, + num_graph_seeds=trajopt_seeds, + num_trajopt_seeds=trajopt_seeds, + interpolation_dt=0.025, + store_ik_debug=enable_debug, + store_trajopt_debug=enable_debug, + interpolation_steps=interpolation_steps, + collision_activation_distance=collision_activation_distance, + trajopt_dt=0.25, + finetune_dt_scale=finetune_dt_scale, + maximum_trajectory_dt=0.15, + finetune_trajopt_iters=200, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=True) + # create a ground truth collision checker: + world_model = WorldConfig.from_dict( + { + "cuboid": { + "table": { + "dims": [1, 1, 1], + "pose": [0, 0, 0, 1, 0, 0, 0], + } + } + } + ) + if args.mesh: + world_model = world_model.get_mesh_world() + config = RobotWorldConfig.load_from_config( + robot_cfg_instance, + world_model, + collision_activation_distance=0.0, + collision_checker_type=CollisionCheckerType.MESH, + n_cuboids=50, + n_meshes=50, + max_collision_distance=100.0, + ) + robot_world = RobotWorld(config) + + return mg, robot_cfg, robot_world + + +def benchmark_mb( + write_usd=False, + save_log=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + override_tsteps: Optional[int] = None, + args=None, +): + # load dataset: + graph_mode = args.graph + interpolation_dt = 0.02 + file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][1:] + + enable_debug = save_log or plot_cost + all_files = [] + og_tsteps = 32 + if override_tsteps is not None: + og_tsteps = override_tsteps + + og_trajopt_seeds = 4 + og_collision_activation_distance = 0.01 + if args.graph: + og_trajopt_seeds = 4 + for file_path in file_paths: + all_groups = [] + mpinets_data = False + problems = file_path() + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + for key, v in tqdm(problems.items()): + 
scene_problems = problems[key] + m_list = [] + i = -1 + ik_fail = 0 + trajopt_seeds = og_trajopt_seeds + tsteps = og_tsteps + collision_activation_distance = og_collision_activation_distance + finetune_dt_scale = 0.9 + parallel_finetune = True + if "cage_panda" in key: + trajopt_seeds = 8 + + if "table_under_pick_panda" in key: + trajopt_seeds = 8 + finetune_dt_scale = 0.98 + + if key == "cubby_task_oriented": + trajopt_seeds = 16 + finetune_dt_scale = 0.98 + + if "dresser_task_oriented" in key: + trajopt_seeds = 16 + finetune_dt_scale = 0.98 + + mg, robot_cfg, robot_world = load_curobo( + 0, + enable_debug, + tsteps, + trajopt_seeds, + mpinets_data, + graph_mode, + not args.disable_cuda_graph, + collision_activation_distance=collision_activation_distance, + finetune_dt_scale=finetune_dt_scale, + parallel_finetune=parallel_finetune, + args=args, + ) + for problem in tqdm(scene_problems, leave=False): + i += 1 + if problem["collision_buffer_ik"] < 0.0: + continue + + plan_config = MotionGenPlanConfig( + max_attempts=10, + enable_graph_attempt=1, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=graph_mode, + timeout=60, + enable_opt=not graph_mode, + parallel_finetune=True, + ) + + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + problem_name = key + "_" + str(i) + if args.mesh: + problem_name = "mesh_" + problem_name + # reset planner + mg.reset(reset_seed=False) + world = WorldConfig.from_dict(problem["obstacles"]) + + # mg.world_coll_checker.clear_cache() + world_coll = WorldConfig.from_dict(problem["obstacles"]) + if args.mesh: + world_coll = world_coll.get_mesh_world(merge_meshes=False) + robot_world.clear_world_cache() + robot_world.update_world(world_coll) + + esdf = robot_world.world_model.get_esdf_in_bounding_box( + Cuboid(name="base", pose=[0, 0, 0, 1, 0, 0, 0], dims=[2.4, 2.4, 2.4]), + voxel_size=0.005, + dtype=torch.float32, + ) + world_voxel_collision = mg.world_coll_checker + world_voxel_collision.update_voxel_data(esdf) + + torch.cuda.synchronize() + + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + if i == 0: + for _ in range(3): + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + # if not result.success.item(): + # world = write_yaml(problem["obstacles"], "dresser_task.yml") + # exit() + + # print(result.total_time, result.ik_time, result.trajopt_time, result.finetune_time) + if result.status == "IK Fail": + ik_fail += 1 + problem["solution"] = None + if save_log or write_usd: + world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) + + coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box( + curobo_Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[2, 2, 2]), + voxel_size=0.005, + ) + + coll_mesh.color = [0.0, 0.8, 0.8, 0.2] + + coll_mesh.name = "voxel_world" + # world = WorldConfig(mesh=[coll_mesh]) + world.add_obstacle(coll_mesh) + # get costs: + if plot_cost: + dt = 0.5 + problem_name = "approx_wolfe_p" + problem_name + if result.optimized_dt is not None: + dt = result.optimized_dt.item() + if "trajopt_result" in result.debug_info: + success = result.success.item() + traj_cost = result.debug_info["trajopt_result"].debug_info["solver"][ + "cost" + ][-1] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + 
save_path=join_path("benchmark/log/plot/", problem_name + "_cost"), + log_scale=False, + ) + if "finetune_trajopt_result" in result.debug_info: + traj_cost = result.debug_info["finetune_trajopt_result"].debug_info[ + "solver" + ]["cost"][0] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path( + "benchmark/log/plot/", problem_name + "_ft_cost" + ), + log_scale=False, + ) + if result.success.item(): + q_traj = result.get_interpolated_plan() + problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist() + problem["solution"] = { + "position": result.get_interpolated_plan() + .position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.get_interpolated_plan() + .velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.get_interpolated_plan() + .acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.get_interpolated_plan() + .jerk.cpu() + .squeeze() + .numpy() + .tolist(), + "dt": interpolation_dt, + } + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "opt_traj": { + "position": result.optimized_plan.position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.optimized_plan.velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.optimized_plan.acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.optimized_plan.jerk.cpu().squeeze().numpy().tolist(), + "dt": result.optimized_dt.item(), + }, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + + # check if path is collision free w.r.t. 
ground truth mesh: + # robot_world.world_model.clear_cache() + + q_int_traj = result.get_interpolated_plan().position.unsqueeze(0) + d_int_mask = ( + torch.count_nonzero(~robot_world.validate_trajectory(q_int_traj)) == 0 + ).item() + + q_traj = result.optimized_plan.position.unsqueeze(0) + d_mask = ( + torch.count_nonzero(~robot_world.validate_trajectory(q_traj)) == 0 + ).item() + # d_world, _ = robot_world.get_world_self_collision_distance_from_joint_trajectory( + # q_traj) + # thres_dist = robot_world.contact_distance + # in_collision = d_world.squeeze(0) > thres_dist + # d_mask = not torch.any(in_collision, dim=-1).item() + # if not d_mask: + # write_usd = True + # #print(torch.max(d_world).item(), problem_name) + current_metrics = CuroboMetrics( + skip=False, + success=True, + perception_success=d_mask, + perception_interpolated_success=d_int_mask, + time=result.total_time, + collision=False, + joint_limit_violation=False, + self_collision=False, + position_error=result.position_error.item() * 100.0, + orientation_error=result.rotation_error.item() * 100.0, + eef_position_path_length=10, + eef_orientation_path_length=10, + attempts=result.attempts, + motion_time=result.motion_time.item(), + solve_time=result.solve_time, + jerk=torch.max(torch.abs(result.optimized_plan.jerk)).item(), + ) + + # run planner + if write_usd: # and not d_int_mask: + # CuRobo + from curobo.util.usd_helper import UsdHelper + + q_traj = result.get_interpolated_plan() + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_cfg, + world, + start_state, + q_traj, + dt=result.interpolation_dt, + save_path=join_path("benchmark/log/usd/", problem_name) + ".usd", + interpolation_steps=1, + write_robot_usd_path="benchmark/log/usd/assets/", + robot_usd_local_reference="assets/", + base_frame="/world_" + problem_name, + visualize_robot_spheres=True, + # flatten_usd=True, + ) + # write_usd = False + # exit() + if write_plot: + problem_name = problem_name + plot_traj( + result.optimized_plan, + result.optimized_dt.item(), + # result.get_interpolated_plan(), + # result.interpolation_dt, + title=problem_name, + save_path=join_path("benchmark/log/plot/", problem_name + ".pdf"), + ) + plot_traj( + # result.optimized_plan, + # result.optimized_dt.item(), + result.get_interpolated_plan(), + result.interpolation_dt, + title=problem_name, + save_path=join_path("benchmark/log/plot/", problem_name + "_int.pdf"), + ) + + m_list.append(current_metrics) + all_groups.append(current_metrics) + elif result.valid_query: + current_metrics = CuroboMetrics() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + + m_list.append(current_metrics) + all_groups.append(current_metrics) + else: + # print("invalid: " + problem_name) + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + if save_log: # and not result.success.item(): + # CuRobo + from curobo.util.usd_helper import UsdHelper + + UsdHelper.write_motion_gen_log( + result, + robot_cfg, + 
world, + start_state, + Pose.from_list(pose), + join_path("benchmark/log/usd/", problem_name) + "_debug", + write_ik=True, + write_trajopt=True, + visualize_robot_spheres=True, + grid_space=2, + # flatten_usd=True, + ) + exit() + g_m = CuroboGroupMetrics.from_list(m_list) + print( + key, + f"{g_m.success:2.2f}", + g_m.time.mean, + # g_m.time.percent_75, + g_m.time.percent_98, + g_m.position_error.percent_98, + # g_m.position_error.median, + g_m.orientation_error.percent_98, + g_m.cspace_path_length.percent_98, + g_m.motion_time.percent_98, + g_m.perception_interpolated_success, + # g_m.orientation_error.median, + ) + print(g_m.attempts) + g_m = CuroboGroupMetrics.from_list(all_groups) + print( + "All: ", + f"{g_m.success:2.2f}", + g_m.motion_time.percent_98, + g_m.time.mean, + g_m.time.percent_75, + g_m.position_error.percent_75, + g_m.orientation_error.percent_75, + g_m.perception_success, + ) + print(g_m.attempts) + if write_benchmark: + if not mpinets_data: + write_yaml(problems, "mb_curobo_solution_voxel.yaml") + else: + write_yaml(problems, "mpinets_curobo_solution_voxel.yaml") + all_files += all_groups + g_m = CuroboGroupMetrics.from_list(all_files) + print("######## FULL SET ############") + print("All: ", f"{g_m.success:2.2f}") + print( + "Perception Success (coarse, interpolated):", + g_m.perception_success, + g_m.perception_interpolated_success, + ) + print("MT: ", g_m.motion_time) + print("PT:", g_m.time) + print("ST: ", g_m.solve_time) + print("accuracy: ", g_m.position_error, g_m.orientation_error) + print("Jerk: ", g_m.jerk) + + +if __name__ == "__main__": + setup_curobo_logger("error") + parser = argparse.ArgumentParser() + + parser.add_argument( + "--mesh", + action="store_true", + help="When True, runs only geometric planner", + default=False, + ) + + parser.add_argument( + "--graph", + action="store_true", + help="When True, runs only geometric planner", + default=False, + ) + + parser.add_argument( + "--disable_cuda_graph", + action="store_true", + help="When True, disable cuda graph during benchmarking", + default=False, + ) + args = parser.parse_args() + benchmark_mb( + save_log=False, + write_usd=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + args=args, + ) diff --git a/benchmark/curobo_voxel_profile.py b/benchmark/curobo_voxel_profile.py new file mode 100644 index 0000000..f706d51 --- /dev/null +++ b/benchmark/curobo_voxel_profile.py @@ -0,0 +1,305 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Standard Library +import argparse +from copy import deepcopy +from typing import Optional + +# Third Party +import numpy as np +import torch +from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw +from torch.profiler import ProfilerActivity, profile, record_function +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Cuboid +from curobo.geom.types import Cuboid as curobo_Cuboid +from curobo.geom.types import Mesh, VoxelGrid +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +np.random.seed(0) + + +def load_curobo( + n_cubes: int, + enable_debug: bool = False, + tsteps: int = 30, + trajopt_seeds: int = 4, + mpinets: bool = False, + graph_mode: bool = False, + cuda_graph: bool = True, + collision_activation_distance: float = 0.025, + finetune_dt_scale: float = 1.0, + parallel_finetune: bool = True, +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.0 + + ik_seeds = 30 + if graph_mode: + trajopt_seeds = 4 + if trajopt_seeds >= 14: + ik_seeds = max(100, trajopt_seeds * 2) + if mpinets: + robot_cfg["kinematics"]["lock_joints"] = { + "panda_finger_joint1": 0.025, + "panda_finger_joint2": 0.025, + } + world_cfg = WorldConfig.from_dict( + { + "voxel": { + "base": { + "dims": [2.0, 2.0, 3.0], + "pose": [0, 0, 0, 1, 0, 0, 0], + "voxel_size": 0.01, + "feature_dtype": torch.float8_e4m3fn, + }, + } + } + ) + interpolation_steps = 2000 + if graph_mode: + interpolation_steps = 100 + robot_cfg_instance = RobotConfig.from_dict(robot_cfg, tensor_args=TensorDeviceType()) + K = robot_cfg_instance.kinematics.kinematics_config.joint_limits + K.position[0, :] -= 0.2 + K.position[1, :] += 0.2 + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg_instance, + world_cfg, + trajopt_tsteps=tsteps, + collision_checker_type=CollisionCheckerType.VOXEL, + use_cuda_graph=cuda_graph, + position_threshold=0.005, # 0.5 cm + rotation_threshold=0.05, + num_ik_seeds=ik_seeds, + num_graph_seeds=trajopt_seeds, + num_trajopt_seeds=trajopt_seeds, + interpolation_dt=0.025, + store_ik_debug=enable_debug, + store_trajopt_debug=enable_debug, + interpolation_steps=interpolation_steps, + collision_activation_distance=collision_activation_distance, + trajopt_dt=0.25, + finetune_dt_scale=finetune_dt_scale, + maximum_trajectory_dt=0.16, + finetune_trajopt_iters=300, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=True) + # create a ground truth collision checker: + config = RobotWorldConfig.load_from_config( + robot_cfg_instance, + "collision_table.yml", + collision_activation_distance=0.0, + 
collision_checker_type=CollisionCheckerType.PRIMITIVE, + n_cuboids=50, + ) + robot_world = RobotWorld(config) + + return mg, robot_cfg, robot_world + + +def benchmark_mb( + write_usd=False, + save_log=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + override_tsteps: Optional[int] = None, + args=None, +): + # load dataset: + graph_mode = args.graph + interpolation_dt = 0.02 + file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][:1] + + enable_debug = save_log or plot_cost + all_files = [] + og_tsteps = 32 + if override_tsteps is not None: + og_tsteps = override_tsteps + + og_trajopt_seeds = 12 + og_collision_activation_distance = 0.01 # 0.03 + if args.graph: + og_trajopt_seeds = 4 + for file_path in file_paths: + all_groups = [] + mpinets_data = False + problems = file_path() + + for key, v in tqdm(problems.items()): + scene_problems = problems[key] + m_list = [] + i = -1 + ik_fail = 0 + trajopt_seeds = og_trajopt_seeds + tsteps = og_tsteps + collision_activation_distance = og_collision_activation_distance + finetune_dt_scale = 1.0 + parallel_finetune = True + if "cage_panda" in key: + trajopt_seeds = 16 + finetune_dt_scale = 0.95 + parallel_finetune = True + if "table_under_pick_panda" in key: + tsteps = 36 + trajopt_seeds = 16 + finetune_dt_scale = 0.95 + parallel_finetune = True + # collision_activation_distance = 0.015 + + if "table_pick_panda" in key: + collision_activation_distance = 0.005 + + if "cubby_task_oriented" in key: # and "merged" not in key: + trajopt_seeds = 16 + finetune_dt_scale = 0.95 + collision_activation_distance = 0.005 + parallel_finetune = True + if "dresser_task_oriented" in key: + trajopt_seeds = 16 + finetune_dt_scale = 0.95 + parallel_finetune = True + if key in [ + "tabletop_neutral_start", + "merged_cubby_neutral_start", + "cubby_neutral_start", + "cubby_neutral_goal", + "dresser_neutral_start", + "tabletop_task_oriented", + ]: + collision_activation_distance = 0.005 + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + + mg, robot_cfg, robot_world = load_curobo( + 0, + enable_debug, + tsteps, + trajopt_seeds, + mpinets_data, + graph_mode, + not args.disable_cuda_graph, + collision_activation_distance=collision_activation_distance, + finetune_dt_scale=finetune_dt_scale, + parallel_finetune=parallel_finetune, + ) + for problem in tqdm(scene_problems, leave=False): + i += 1 + if problem["collision_buffer_ik"] < 0.0: + continue + + plan_config = MotionGenPlanConfig( + max_attempts=10, + enable_graph_attempt=1, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=graph_mode, + timeout=60, + enable_opt=not graph_mode, + parallel_finetune=True, + ) + + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + # reset planner + mg.reset(reset_seed=False) + world_coll = WorldConfig.from_dict(problem["obstacles"]).get_obb_world() + + robot_world.update_world(world_coll) + + esdf = robot_world.world_model.get_esdf_in_bounding_box( + Cuboid(name="base", pose=[0, 0, 0, 1, 0, 0, 0], dims=[2, 2, 3]), voxel_size=0.01 + ) + world_voxel_collision = mg.world_coll_checker + world_voxel_collision.update_voxel_data(esdf) + + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + for _ in range(2): + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + print("Profiling...") + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + torch.cuda.profiler.start() + 
+ result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + torch.cuda.profiler.stop() + + print("Exporting the trace..") + prof.export_chrome_trace("benchmark/log/trace/motion_gen_voxel.json") + + exit() + + +if __name__ == "__main__": + setup_curobo_logger("error") + parser = argparse.ArgumentParser() + + parser.add_argument( + "--graph", + action="store_true", + help="When True, runs only geometric planner", + default=False, + ) + parser.add_argument( + "--disable_cuda_graph", + action="store_true", + help="When True, disable cuda graph during benchmarking", + default=False, + ) + args = parser.parse_args() + benchmark_mb( + save_log=False, + write_usd=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + args=args, + ) diff --git a/benchmark/ik_benchmark.py b/benchmark/ik_benchmark.py index 7c97f75..2eb75ba 100644 --- a/benchmark/ik_benchmark.py +++ b/benchmark/ik_benchmark.py @@ -59,21 +59,22 @@ def run_full_config_collision_free_ik( robot_cfg = RobotConfig.from_dict(robot_data) world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) position_threshold = 0.005 + grad_iters = None if high_precision: position_threshold = 0.001 + grad_iters = 100 ik_config = IKSolverConfig.load_from_robot_config( robot_cfg, world_cfg, - rotation_threshold=0.05, position_threshold=position_threshold, - num_seeds=24, + num_seeds=20, self_collision_check=collision_free, self_collision_opt=collision_free, tensor_args=tensor_args, use_cuda_graph=use_cuda_graph, high_precision=high_precision, regularization=False, - # grad_iters=500, + grad_iters=grad_iters, ) ik_solver = IKSolver(ik_config) @@ -140,7 +141,7 @@ if __name__ == "__main__": "Position-Error-Collision-Free-IK(mm)": [], "Orientation-Error-Collision-Free-IK": [], } - for robot_file in robot_list: + for robot_file in robot_list[:1]: # create a sampler with dof: for b_size in b_list: # sample test configs: diff --git a/docker/aarch64.dockerfile b/docker/aarch64.dockerfile index e1ee188..3ea0d5c 100644 --- a/docker/aarch64.dockerfile +++ b/docker/aarch64.dockerfile @@ -137,7 +137,7 @@ RUN pip3 install warp-lang # install curobo: -RUN mkdir /pkgs && git clone https://github.com/NVlabs/curobo.git +RUN mkdir /pkgs && cd /pkgs && git clone https://github.com/NVlabs/curobo.git ENV TORCH_CUDA_ARCH_LIST "7.0+PTX" @@ -166,7 +166,7 @@ RUN cd /pkgs && git clone https://github.com/nvlabs/nvblox_torch.git && \ python3 -m pip install -e . 
-RUN python -m pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git" +RUN python3 -m pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git" # upgrade typing extensions: RUN python3 -m pip install typing-extensions --upgrade diff --git a/docker/user.dockerfile b/docker/user.dockerfile index 50bc6dc..b402b83 100644 --- a/docker/user.dockerfile +++ b/docker/user.dockerfile @@ -15,12 +15,13 @@ FROM curobo_docker:${IMAGE_TAG} # Set variables ARG USERNAME ARG USER_ID -ARG CACHE_DATE=2024-02-20 +ARG CACHE_DATE=2024-03-18 # Set environment variables # Set up sudo user #RUN /sbin/adduser --disabled-password --gecos '' --uid $USER_ID $USERNAME +# RUN useradd -l -u 1000 $USERNAME RUN useradd -l -u $USER_ID -g users $USERNAME RUN /sbin/adduser $USERNAME sudo @@ -28,6 +29,7 @@ RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers # Set user +# RUN mkdir /home/$USERNAME && chown $USERNAME:$USERNAME /home/$USERNAME USER $USERNAME WORKDIR /home/$USERNAME ENV USER=$USERNAME diff --git a/examples/ik_example.py b/examples/ik_example.py index 7798a6f..3488100 100644 --- a/examples/ik_example.py +++ b/examples/ik_example.py @@ -31,7 +31,7 @@ torch.backends.cudnn.allow_tf32 = True def demo_basic_ik(): tensor_args = TensorDeviceType() - config_file = load_yaml(join_path(get_robot_configs_path(), "franka.yml")) + config_file = load_yaml(join_path(get_robot_configs_path(), "ur10e.yml")) urdf_file = config_file["robot_cfg"]["kinematics"][ "urdf_path" ] # Send global path starting with "/" @@ -48,13 +48,13 @@ def demo_basic_ik(): self_collision_check=False, self_collision_opt=False, tensor_args=tensor_args, - use_cuda_graph=False, + use_cuda_graph=True, ) ik_solver = IKSolver(ik_config) # print(kin_state) for _ in range(10): - q_sample = ik_solver.sample_configs(5000) + q_sample = ik_solver.sample_configs(100) kin_state = ik_solver.fk(q_sample) goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) @@ -62,12 +62,12 @@ def demo_basic_ik(): result = ik_solver.solve_batch(goal) torch.cuda.synchronize() print( - "Success, Solve Time(s), Total Time(s) ", + "Success, Solve Time(s), hz ", torch.count_nonzero(result.success).item() / len(q_sample), result.solve_time, q_sample.shape[0] / (time.time() - st_time), - torch.mean(result.position_error) * 100.0, - torch.mean(result.rotation_error) * 100.0, + torch.mean(result.position_error), + torch.mean(result.rotation_error), ) @@ -97,7 +97,7 @@ def demo_full_config_collision_free_ik(): # print(kin_state) print("Running Single IK") for _ in range(10): - q_sample = ik_solver.sample_configs(5000) + q_sample = ik_solver.sample_configs(1) kin_state = ik_solver.fk(q_sample) goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) @@ -107,7 +107,7 @@ def demo_full_config_collision_free_ik(): total_time = (time.time() - st_time) / q_sample.shape[0] print( "Success, Solve Time(s), Total Time(s)", - torch.count_nonzero(result.success).item() / len(q_sample), + torch.count_nonzero(result.success).item(), result.solve_time, total_time, 1.0 / total_time, diff --git a/examples/isaac_sim/helper.py b/examples/isaac_sim/helper.py index 20a1d2e..9586ed0 100644 --- a/examples/isaac_sim/helper.py +++ b/examples/isaac_sim/helper.py @@ -100,7 +100,6 @@ def add_robot_to_scene( import_config, dest_path, ) - # prim_path = omni.usd.get_stage_next_free_path( # my_world.scene.stage, str(my_world.scene.stage.GetDefaultPrim().GetPath()) + robot_path, False) # print(prim_path) @@ -112,13 +111,13 @@ def add_robot_to_scene( 
position=position, ) if ISAAC_SIM_23: - robot_p.set_solver_velocity_iteration_count(4) + robot_p.set_solver_velocity_iteration_count(0) robot_p.set_solver_position_iteration_count(44) my_world._physics_context.set_solver_type("PGS") robot = my_world.scene.add(robot_p) - + robot_path = robot.prim_path return robot, robot_path diff --git a/examples/kinematics_example.py b/examples/kinematics_example.py index 9b83829..8f2958a 100644 --- a/examples/kinematics_example.py +++ b/examples/kinematics_example.py @@ -39,7 +39,7 @@ def demo_basic_robot(): # compute forward kinematics: - q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + q = torch.rand((10, kin_model.get_dof()), **(tensor_args.as_torch_dict())) out = kin_model.get_state(q) # here is the kinematics state: # print(out) @@ -55,7 +55,7 @@ def demo_full_config_robot(): kin_model = CudaRobotModel(robot_cfg.kinematics) # compute forward kinematics: - q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + q = torch.rand((10, kin_model.get_dof()), **(tensor_args.as_torch_dict())) out = kin_model.get_state(q) # here is the kinematics state: # print(out) diff --git a/examples/mpc_example.py b/examples/mpc_example.py index fef9b8e..06bb92e 100644 --- a/examples/mpc_example.py +++ b/examples/mpc_example.py @@ -94,10 +94,10 @@ def demo_full_config_mpc(): while not converged: st_time = time.time() # current_state.position += 0.1 - print(current_state.position) + # print(current_state.position) result = mpc.step(current_state, 1) - print(mpc.get_visual_rollouts().shape) + # print(mpc.get_visual_rollouts().shape) # exit() torch.cuda.synchronize() if tstep > 5: diff --git a/examples/robot_image_segmentation_example.py b/examples/robot_image_segmentation_example.py index d98fcdb..52e82c7 100644 --- a/examples/robot_image_segmentation_example.py +++ b/examples/robot_image_segmentation_example.py @@ -32,7 +32,6 @@ from curobo.types.robot import RobotConfig from curobo.types.state import JointState from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml from curobo.wrap.model.robot_segmenter import RobotSegmenter -from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig torch.manual_seed(30) @@ -42,7 +41,13 @@ torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True -def create_render_dataset(robot_file, save_debug_data: bool = False): +def create_render_dataset( + robot_file, + save_debug_data: bool = False, + fov_deg: float = 60, + n_frames: int = 20, + retract_delta: float = 0.0, +): # load robot: robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file)) robot_dict["robot_cfg"]["kinematics"]["load_link_names_with_mesh"] = True @@ -54,6 +59,8 @@ def create_render_dataset(robot_file, save_debug_data: bool = False): q = kin_model.retract_config + q += retract_delta + meshes = kin_model.get_robot_as_mesh(q) world = WorldConfig(mesh=meshes[:]) @@ -71,10 +78,11 @@ def create_render_dataset(robot_file, save_debug_data: bool = False): mesh_dataset = MeshDataset( None, - n_frames=20, - image_size=480, + n_frames=n_frames, + image_size=640, save_data_dir=None, trimesh_mesh=robot_mesh, + fov_deg=fov_deg, ) q_js = JointState(position=q, joint_names=kin_model.joint_names) @@ -83,6 +91,7 @@ def create_render_dataset(robot_file, save_debug_data: bool = False): def mask_image(robot_file="ur5e.yml"): save_debug_data = False + write_pointcloud = False # create robot segmenter: tensor_args = TensorDeviceType() @@ -90,7 +99,7 @@ def 
mask_image(robot_file="ur5e.yml"): robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True ) - mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data) + mesh_dataset, q_js = create_render_dataset(robot_file, write_pointcloud, n_frames=20) if save_debug_data: visualize_scale = 10.0 @@ -113,14 +122,15 @@ def mask_image(robot_file="ur5e.yml"): # save robot spheres in current joint configuration robot_kinematics = curobo_segmenter._robot_world.kinematics - sph = robot_kinematics.get_robot_as_spheres(q_js.position) - WorldConfig(sphere=sph[0]).save_world_as_mesh("robot_spheres.stl") + if write_pointcloud: + sph = robot_kinematics.get_robot_as_spheres(q_js.position) + WorldConfig(sphere=sph[0]).save_world_as_mesh("robot_spheres.stl") - # save world pointcloud in robot origin + # save world pointcloud in robot origin - pc = cam_obs.get_pointcloud() - pc_obs = PointCloud("world", pose=cam_obs.pose.to_list(), points=pc) - pc_obs.save_as_mesh("camera_pointcloud.stl", transform_with_pose=True) + pc = cam_obs.get_pointcloud() + pc_obs = PointCloud("world", pose=cam_obs.pose.to_list(), points=pc) + pc_obs.save_as_mesh("camera_pointcloud.stl", transform_with_pose=True) # run segmentation: depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( @@ -131,10 +141,12 @@ def mask_image(robot_file="ur5e.yml"): robot_mask = cam_obs.clone() robot_mask.depth_image[~depth_mask] = 0.0 - robot_mesh = PointCloud( - "world", pose=robot_mask.pose.to_list(), points=robot_mask.get_pointcloud() - ) - robot_mesh.save_as_mesh("robot_segmented.stl", transform_with_pose=True) + + if write_pointcloud: + robot_mesh = PointCloud( + "world", pose=robot_mask.pose.to_list(), points=robot_mask.get_pointcloud() + ) + robot_mesh.save_as_mesh("robot_segmented.stl", transform_with_pose=True) # save depth image imageio.imwrite( "robot_depth.png", @@ -150,10 +162,11 @@ def mask_image(robot_file="ur5e.yml"): world_mask = cam_obs.clone() world_mask.depth_image[depth_mask] = 0.0 - world_mesh = PointCloud( - "world", pose=world_mask.pose.to_list(), points=world_mask.get_pointcloud() - ) - world_mesh.save_as_mesh("world_segmented.stl", transform_with_pose=True) + if write_pointcloud: + world_mesh = PointCloud( + "world", pose=world_mask.pose.to_list(), points=world_mask.get_pointcloud() + ) + world_mesh.save_as_mesh("world_segmented.stl", transform_with_pose=True) imageio.imwrite( "world_depth.png", @@ -190,6 +203,248 @@ def mask_image(robot_file="ur5e.yml"): print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:])) +def batch_mask_image(robot_file="ur5e.yml"): + """Mask images from different camera views using batched query. + + Note: This only works for a single joint configuration across camera views. + + Args: + robot_file: robot to use for example. 
+ """ + save_debug_data = True + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data, fov_deg=60) + + mesh_dataset_zoom, q_js = create_render_dataset( + robot_file, save_debug_data, fov_deg=40, n_frames=30 + ) + + if save_debug_data: + visualize_scale = 10.0 + data = mesh_dataset[0] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + data_zoom = mesh_dataset_zoom[1] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + + for i in range(cam_obs.depth_image.shape[0]): + # save depth image + imageio.imwrite( + "camera_depth_" + str(i) + ".png", + (cam_obs.depth_image[i] * visualize_scale) + .squeeze() + .detach() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # run segmentation: + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + # save robot points as mesh + + robot_mask = cam_obs.clone() + robot_mask.depth_image[~depth_mask] = 0.0 + for i in range(cam_obs.depth_image.shape[0]): + # save depth image + imageio.imwrite( + "robot_depth_" + str(i) + ".png", + (robot_mask.depth_image[i] * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # save world points as mesh + + imageio.imwrite( + "world_depth_" + str(i) + ".png", + (filtered_image[i] * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + dt_list = [] + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + data_zoom = mesh_dataset_zoom[i + 1] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + st_time = time.time() + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + + torch.cuda.synchronize() + dt_list.append(time.time() - st_time) + + print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:])) + + +def batch_robot_mask_image(robot_file="ur5e.yml"): + """Mask images from different camera views using batched query. + + Note: This example treats each image to have different robot joint configuration + + Args: + robot_file: robot to use for example. 
+ """ + save_debug_data = True + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data, fov_deg=60) + + mesh_dataset_zoom, q_js_zoom = create_render_dataset( + robot_file, save_debug_data, fov_deg=60, retract_delta=-0.5 + ) + q_js = q_js.unsqueeze(0) + q_js = q_js.stack(q_js_zoom.unsqueeze(0)) + + if save_debug_data: + visualize_scale = 10.0 + data = mesh_dataset[0] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + data_zoom = mesh_dataset_zoom[0] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + + for i in range(cam_obs.depth_image.shape[0]): + # save depth image + imageio.imwrite( + "camera_depth_" + str(i) + ".png", + (cam_obs.depth_image[i] * visualize_scale) + .squeeze() + .detach() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # run segmentation: + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + # save robot points as mesh + + robot_mask = cam_obs.clone() + robot_mask.depth_image[~depth_mask] = 0.0 + for i in range(cam_obs.depth_image.shape[0]): + # save depth image + imageio.imwrite( + "robot_depth_" + str(i) + ".png", + (robot_mask.depth_image[i] * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + # save world points as mesh + + imageio.imwrite( + "world_depth_" + str(i) + ".png", + (filtered_image[i] * visualize_scale) + .detach() + .squeeze() + .cpu() + .numpy() + .astype(np.uint16), + ) + + dt_list = [] + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + data_zoom = mesh_dataset_zoom[i] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + st_time = time.time() + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + + torch.cuda.synchronize() + dt_list.append(time.time() - st_time) + + print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:])) + + def profile_mask_image(robot_file="ur5e.yml"): # create robot segmenter: tensor_args = TensorDeviceType() @@ -236,3 +491,5 @@ def profile_mask_image(robot_file="ur5e.yml"): if __name__ == "__main__": mask_image() # profile_mask_image() + # batch_mask_image() + # batch_robot_mask_image() diff --git a/setup.cfg b/setup.cfg index d7bb59e..d5e1758 100644 --- a/setup.cfg +++ 
b/setup.cfg @@ -41,12 +41,12 @@ platforms = any [options] install_requires = + pybind11 networkx numpy numpy-quaternion pyyaml setuptools_scm>=6.2 - torchtyping torch>=1.10 trimesh yourdfpy>=0.0.53 @@ -55,7 +55,7 @@ install_requires = tqdm wheel importlib_resources - + scikit-image packages = find_namespace: package_dir = = src diff --git a/setup.py b/setup.py index c875560..11201a9 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ """curobo package setuptools.""" -# NOTE (roflaherty): This file is still needed to allow the package to be +# NOTE: This file is still needed to allow the package to be # installed in editable mode. # # References: @@ -21,12 +21,6 @@ import setuptools from torch.utils.cpp_extension import BuildExtension, CUDAExtension -print("*********************************") -print("CuRobo: Compiling CUDA kernels...") -print( - "This will take 20+ minutes, to speedup compilation set TORCH_CUDA_ARCH_LIST={X}+PTX to " - + " compile only for a specific architecture. e.g., export TORCH_CUDA_ARCH_LIST=7.0+PTX" -) extra_cuda_args = { "nvcc": [ "--threads=8", @@ -55,16 +49,6 @@ ext_modules = [ ], extra_compile_args=extra_cuda_args, ), - CUDAExtension( - "curobo.curobolib.geom_cu", - [ - "src/curobo/curobolib/cpp/geom_cuda.cpp", - "src/curobo/curobolib/cpp/sphere_obb_kernel.cu", - "src/curobo/curobolib/cpp/pose_distance_kernel.cu", - "src/curobo/curobolib/cpp/self_collision_kernel.cu", - ], - extra_compile_args=extra_cuda_args, - ), CUDAExtension( "curobo.curobolib.line_search_cu", [ @@ -82,6 +66,16 @@ ext_modules = [ ], extra_compile_args=extra_cuda_args, ), + CUDAExtension( + "curobo.curobolib.geom_cu", + [ + "src/curobo/curobolib/cpp/geom_cuda.cpp", + "src/curobo/curobolib/cpp/sphere_obb_kernel.cu", + "src/curobo/curobolib/cpp/pose_distance_kernel.cu", + "src/curobo/curobolib/cpp/self_collision_kernel.cu", + ], + extra_compile_args=extra_cuda_args, + ), ] setuptools.setup( diff --git a/src/curobo/__init__.py b/src/curobo/__init__.py index d56871c..5db6e50 100644 --- a/src/curobo/__init__.py +++ b/src/curobo/__init__.py @@ -56,8 +56,10 @@ def _get_version(): # `importlib_metadata` is the back ported library for older versions of python. 
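
[Note] The __init__.py hunk continuing below wraps the package-version lookup in a try/except so that source checkouts without installed distribution metadata still report a version string ("v0.7.0-no-tag") instead of raising. The following standalone sketch shows the same pattern; it narrows the patch's bare except to PackageNotFoundError as an editorial refinement, which is an assumption and not part of the patch.

    # Third Party
    from importlib_metadata import PackageNotFoundError, version

    def _get_version() -> str:
        # Query the installed distribution metadata for the package version.
        try:
            return version("nvidia_curobo")
        except PackageNotFoundError:
            # Source checkouts without installed metadata fall back to a static tag.
            return "v0.7.0-no-tag"
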
# Third Party from importlib_metadata import version - - return version("nvidia_curobo") + try: + return version("nvidia_curobo") + except: + return "v0.7.0-no-tag" # Set `__version__` attribute diff --git a/src/curobo/content/assets/robot/franka_description/franka_test.urdf b/src/curobo/content/assets/robot/franka_description/franka_test.urdf index eb4115d..2b841b3 100644 --- a/src/curobo/content/assets/robot/franka_description/franka_test.urdf +++ b/src/curobo/content/assets/robot/franka_description/franka_test.urdf @@ -280,6 +280,7 @@ + diff --git a/src/curobo/content/configs/robot/franka.yml b/src/curobo/content/configs/robot/franka.yml index bc3f013..c555f34 100644 --- a/src/curobo/content/configs/robot/franka.yml +++ b/src/curobo/content/configs/robot/franka.yml @@ -13,7 +13,7 @@ robot_cfg: kinematics: use_usd_kinematics: False isaac_usd_path: "/Isaac/Robots/Franka/franka.usd" - usd_path: "robot/franka/franka_panda.usda" + usd_path: "robot/non_shipping/franka/franka_panda_meters.usda" usd_robot_root: "/panda" usd_flip_joints: ["panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5", "panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"] diff --git a/src/curobo/content/configs/robot/iiwa.yml b/src/curobo/content/configs/robot/iiwa.yml index 585541a..caf9e9d 100644 --- a/src/curobo/content/configs/robot/iiwa.yml +++ b/src/curobo/content/configs/robot/iiwa.yml @@ -39,7 +39,13 @@ robot_cfg: } urdf_path: robot/iiwa_allegro_description/iiwa.urdf asset_root_path: robot/iiwa_allegro_description - + mesh_link_names: + - iiwa7_link_1 + - iiwa7_link_2 + - iiwa7_link_3 + - iiwa7_link_4 + - iiwa7_link_5 + - iiwa7_link_6 cspace: joint_names: [ diff --git a/src/curobo/content/configs/robot/iiwa_allegro.yml b/src/curobo/content/configs/robot/iiwa_allegro.yml index 700f4f0..b8435fe 100644 --- a/src/curobo/content/configs/robot/iiwa_allegro.yml +++ b/src/curobo/content/configs/robot/iiwa_allegro.yml @@ -30,6 +30,25 @@ robot_cfg: - ring_link_3 - thumb_link_2 - thumb_link_3 + mesh_link_names: + - iiwa7_link_1 + - iiwa7_link_2 + - iiwa7_link_3 + - iiwa7_link_4 + - iiwa7_link_5 + - iiwa7_link_6 + - palm_link + - index_link_1 + - index_link_2 + - index_link_3 + - middle_link_1 + - middle_link_2 + - middle_link_3 + - ring_link_1 + - ring_link_2 + - ring_link_3 + - thumb_link_2 + - thumb_link_3 collision_sphere_buffer: 0.005 collision_spheres: spheres/iiwa_allegro.yml ee_link: palm_link diff --git a/src/curobo/content/configs/robot/spheres/franka_mesh.yml b/src/curobo/content/configs/robot/spheres/franka_mesh.yml index e4b578c..1dec717 100644 --- a/src/curobo/content/configs/robot/spheres/franka_mesh.yml +++ b/src/curobo/content/configs/robot/spheres/franka_mesh.yml @@ -134,11 +134,11 @@ collision_spheres: "radius": 0.022 panda_leftfinger: - "center": [0.0, 0.01, 0.043] - "radius": 0.011 # 25 + "radius": 0.011 #0.025 # 0.011 - "center": [0.0, 0.02, 0.015] - "radius": 0.011 # 25 + "radius": 0.011 #0.025 # 0.011 panda_rightfinger: - "center": [0.0, -0.01, 0.043] - "radius": 0.011 #25 + "radius": 0.011 #0.025 #0.011 - "center": [0.0, -0.02, 0.015] - "radius": 0.011 #25 \ No newline at end of file + "radius": 0.011 #0.025 #0.011 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/ur10e.yml b/src/curobo/content/configs/robot/spheres/ur10e.yml index aaedf9b..19973e1 100644 --- a/src/curobo/content/configs/robot/spheres/ur10e.yml +++ b/src/curobo/content/configs/robot/spheres/ur10e.yml @@ -58,7 +58,7 @@ collision_spheres: radius: 0.07 tool0: - 
center: [0, 0, 0.12] - radius: 0.05 + radius: -0.01 camera_mount: - center: [0, 0.11, -0.01] radius: 0.06 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/ur10e.yml b/src/curobo/content/configs/robot/ur10e.yml index 11fe3bb..c47b4a4 100644 --- a/src/curobo/content/configs/robot/ur10e.yml +++ b/src/curobo/content/configs/robot/ur10e.yml @@ -38,7 +38,7 @@ robot_cfg: 'wrist_1_link': 0, 'wrist_2_link': 0, 'wrist_3_link' : 0, - 'tool0': 0, + 'tool0': 0.05, } mesh_link_names: [ 'shoulder_link','upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link' ,'wrist_3_link' ] lock_joints: null diff --git a/src/curobo/content/configs/robot/ur5e.yml b/src/curobo/content/configs/robot/ur5e.yml index 689bd9a..3c17015 100644 --- a/src/curobo/content/configs/robot/ur5e.yml +++ b/src/curobo/content/configs/robot/ur5e.yml @@ -92,7 +92,7 @@ robot_cfg: "radius": 0.043 tool0: - "center": [0.001, 0.001, 0.05] - "radius": 0.05 + "radius": -0.01 #0.05 collision_sphere_buffer: 0.005 @@ -109,6 +109,7 @@ robot_cfg: 'wrist_1_link': 0, 'wrist_2_link': 0, 'wrist_3_link' : 0, + 'tool0': 0.025, } use_global_cumul: True diff --git a/src/curobo/content/configs/task/finetune_trajopt.yml b/src/curobo/content/configs/task/finetune_trajopt.yml index 467d86b..d692422 100644 --- a/src/curobo/content/configs/task/finetune_trajopt.yml +++ b/src/curobo/content/configs/task/finetune_trajopt.yml @@ -55,17 +55,17 @@ cost: bound_cfg: weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values - smooth_weight: [0.0,5000.0, 50.0, 0.0] # [vel, acc, jerk,] + smooth_weight: [0.0,5000.0, 50.0, 0.0] #[0.0,5000.0, 50.0, 0.0] # [vel, acc, jerk,] run_weight_velocity: 0.0 run_weight_acceleration: 1.0 run_weight_jerk: 1.0 - activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk + activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk null_space_weight: [0.0] primitive_collision_cfg: - weight: 1000000.0 + weight: 1000000.0 #1000000.0 1000000 use_sweep: True - sweep_steps: 6 + sweep_steps: 4 classify: False use_sweep_kernel: True use_speed_metric: True @@ -79,7 +79,7 @@ cost: lbfgs: - n_iters: 400 # 400 + n_iters: 300 # 400 inner_iters: 25 cold_start_n_iters: null min_iters: 25 @@ -89,7 +89,7 @@ lbfgs: cost_delta_threshold: 1.0 cost_relative_threshold: 0.999 #0.999 epsilon: 0.01 - history: 15 #15 + history: 27 #15 use_cuda_graph: True n_problems: 1 store_debug: False diff --git a/src/curobo/content/configs/task/gradient_ik.yml b/src/curobo/content/configs/task/gradient_ik.yml index 97e17b6..c81517b 100644 --- a/src/curobo/content/configs/task/gradient_ik.yml +++ b/src/curobo/content/configs/task/gradient_ik.yml @@ -47,7 +47,7 @@ cost: activation_distance: [0.1] null_space_weight: [1.0] primitive_collision_cfg: - weight: 50000.0 + weight: 5000.0 use_sweep: False classify: False activation_distance: 0.01 @@ -57,11 +57,11 @@ cost: lbfgs: - n_iters: 100 #60 - inner_iters: 25 + n_iters: 80 #60 + inner_iters: 20 cold_start_n_iters: null min_iters: 20 - line_search_scale: [0.01, 0.3, 0.7, 1.0] + line_search_scale: [0.1, 0.3, 0.7, 1.0] fixed_iters: True cost_convergence: 1e-7 cost_delta_threshold: 1e-6 #0.0001 diff --git a/src/curobo/content/configs/task/gradient_trajopt.yml b/src/curobo/content/configs/task/gradient_trajopt.yml index 471d186..02d3966 100644 --- a/src/curobo/content/configs/task/gradient_trajopt.yml +++ b/src/curobo/content/configs/task/gradient_trajopt.yml @@ -40,7 +40,7 @@ cost: link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # 
orientation, position for all timesteps - run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position weight: [2000,50000.0,30,50] #[150.0, 2000.0, 30, 40] vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation terminal: True @@ -54,19 +54,17 @@ cost: bound_cfg: weight: [5000.0, 50000.0, 50000.0,50000.0] # needs to be 3 values - #weight: [000.0, 000.0, 000.0,000.0] smooth_weight: [0.0,10000.0,10.0, 0.0] # [vel, acc, jerk, alpha_v-not-used] - #smooth_weight: [0.0,0000.0,0.0, 0.0] # [vel, acc, jerk, alpha_v-not-used] run_weight_velocity: 0.00 run_weight_acceleration: 1.0 run_weight_jerk: 1.0 - activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk + activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk null_space_weight: [0.0] primitive_collision_cfg: weight: 100000.0 use_sweep: True - sweep_steps: 6 + sweep_steps: 4 classify: False use_sweep_kernel: True use_speed_metric: True @@ -81,11 +79,11 @@ cost: lbfgs: - n_iters: 125 #175 + n_iters: 100 #175 inner_iters: 25 cold_start_n_iters: null min_iters: 25 - line_search_scale: [0.01,0.3,0.7,1.0] #[0.01,0.2, 0.3,0.5,0.7,0.9, 1.0] # + line_search_scale: [0.01,0.3,0.7,1.0] fixed_iters: True cost_convergence: 0.01 cost_delta_threshold: 2000.0 diff --git a/src/curobo/content/configs/task/graph.yml b/src/curobo/content/configs/task/graph.yml index 92ae0d1..56dbabd 100644 --- a/src/curobo/content/configs/task/graph.yml +++ b/src/curobo/content/configs/task/graph.yml @@ -33,7 +33,7 @@ graph: sample_pts: 1500 node_similarity_distance: 0.1 - rejection_ratio: 20 + rejection_ratio: 10 k_nn: 15 max_buffer: 10000 max_cg_buffer: 1000 diff --git a/src/curobo/content/configs/task/particle_ik.yml b/src/curobo/content/configs/task/particle_ik.yml index 903e549..414f873 100644 --- a/src/curobo/content/configs/task/particle_ik.yml +++ b/src/curobo/content/configs/task/particle_ik.yml @@ -34,7 +34,7 @@ cost: run_weight: 1.0 link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - weight: [30, 50, 10, 10] #[20.0, 100.0] + weight: [0, 50, 10, 10] #[20.0, 100.0] vec_convergence: [0.00, 0.000] # orientation, position terminal: False use_metric: True @@ -67,7 +67,7 @@ mppi: cov_type : "DIAG_A" # kappa : 0.01 null_act_frac : 0.0 - sample_mode : 'BEST' + sample_mode : 'MEAN' base_action : 'REPEAT' squash_fn : 'CLAMP' n_problems : 1 diff --git a/src/curobo/content/configs/task/particle_mpc.yml b/src/curobo/content/configs/task/particle_mpc.yml index 2b5b7e8..d10544d 100644 --- a/src/curobo/content/configs/task/particle_mpc.yml +++ b/src/curobo/content/configs/task/particle_mpc.yml @@ -89,7 +89,7 @@ mppi: sample_mode : 'BEST' base_action : 'REPEAT' squash_fn : 'CLAMP' - n_problems : 1 + n_problems : 1 use_cuda_graph : True seed : 0 store_debug : False diff --git a/src/curobo/content/configs/task/particle_trajopt.yml b/src/curobo/content/configs/task/particle_trajopt.yml index 1c76623..23ae39a 100644 --- a/src/curobo/content/configs/task/particle_trajopt.yml +++ b/src/curobo/content/configs/task/particle_trajopt.yml @@ -39,7 +39,7 @@ cost: link_pose_cfg: vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position + run_vec_weight: [0.0,0.0,0.0,0.0,0.0,0.0] # running weight weight: [0.0, 5000.0, 40, 40] vec_convergence: [0.0,0.0,1000.0,1000.0] terminal: True @@ -63,11 +63,11 @@ 
cost: primitive_collision_cfg: weight: 5000.0 - use_sweep: True + use_sweep: False classify: False sweep_steps: 4 - use_sweep_kernel: True - use_speed_metric: True + use_sweep_kernel: False + use_speed_metric: False speed_dt: 0.01 # used only for speed metric activation_distance: 0.025 @@ -92,7 +92,7 @@ mppi: cov_type : "DIAG_A" # kappa : 0.001 null_act_frac : 0.0 - sample_mode : 'BEST' + sample_mode : 'MEAN' base_action : 'REPEAT' squash_fn : 'CLAMP' n_problems : 1 diff --git a/src/curobo/cuda_robot_model/cuda_robot_model.py b/src/curobo/cuda_robot_model/cuda_robot_model.py index 6f5a45a..016816b 100644 --- a/src/curobo/cuda_robot_model/cuda_robot_model.py +++ b/src/curobo/cuda_robot_model/cuda_robot_model.py @@ -180,7 +180,7 @@ class CudaRobotModel(CudaRobotModelConfig): self._batch_robot_spheres = torch.zeros( (self._batch_size, self.kinematics_config.total_spheres, 4), device=self.tensor_args.device, - dtype=self.tensor_args.dtype, + dtype=self.tensor_args.collision_geometry_dtype, ) self._grad_out_q = torch.zeros( (self._batch_size, self.get_dof()), diff --git a/src/curobo/cuda_robot_model/types.py b/src/curobo/cuda_robot_model/types.py index 9c782ad..776c240 100644 --- a/src/curobo/cuda_robot_model/types.py +++ b/src/curobo/cuda_robot_model/types.py @@ -23,6 +23,7 @@ from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.types.state import JointState from curobo.types.tensor import T_DOF +from curobo.util.logger import log_error from curobo.util.tensor_util import clone_if_not_none, copy_if_not_none @@ -138,6 +139,13 @@ class CSpaceConfig: self.acceleration_scale = self.tensor_args.to_device(self.acceleration_scale) if isinstance(self.jerk_scale, List): self.jerk_scale = self.tensor_args.to_device(self.jerk_scale) + # check shapes: + if self.retract_config is not None: + dof = self.retract_config.shape + if self.cspace_distance_weight is not None and self.cspace_distance_weight.shape != dof: + log_error("cspace_distance_weight shape does not match retract_config") + if self.null_space_weight is not None and self.null_space_weight.shape != dof: + log_error("null_space_weight shape does not match retract_config") def inplace_reindex(self, joint_names: List[str]): new_index = [self.joint_names.index(j) for j in joint_names] @@ -207,8 +215,8 @@ class CSpaceConfig: ): retract_config = ((joint_position_upper + joint_position_lower) / 2).flatten() n_dof = retract_config.shape[-1] - null_space_weight = torch.ones(n_dof, **vars(tensor_args)) - cspace_distance_weight = torch.ones(n_dof, **vars(tensor_args)) + null_space_weight = torch.ones(n_dof, **(tensor_args.as_torch_dict())) + cspace_distance_weight = torch.ones(n_dof, **(tensor_args.as_torch_dict())) return CSpaceConfig( joint_names, retract_config, @@ -289,8 +297,8 @@ class KinematicsTensorConfig: retract_config = ( (self.joint_limits.position[1] + self.joint_limits.position[0]) / 2 ).flatten() - null_space_weight = torch.ones(self.n_dof, **vars(self.tensor_args)) - cspace_distance_weight = torch.ones(self.n_dof, **vars(self.tensor_args)) + null_space_weight = torch.ones(self.n_dof, **(self.tensor_args.as_torch_dict())) + cspace_distance_weight = torch.ones(self.n_dof, **(self.tensor_args.as_torch_dict())) joint_names = self.joint_names self.cspace = CSpaceConfig( joint_names, diff --git a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py index e116f68..f733798 100644 --- a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py +++ 
b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py @@ -175,7 +175,11 @@ class UrdfKinematicsParser(KinematicsParser): return txt def get_link_mesh(self, link_name): - m = self._robot.link_map[link_name].visuals[0].geometry.mesh + link_data = self._robot.link_map[link_name] + + if len(link_data.visuals) == 0: + log_error(link_name + " not found in urdf, remove from mesh_link_names") + m = link_data.visuals[0].geometry.mesh mesh_pose = self._robot.link_map[link_name].visuals[0].origin # read visual material: if mesh_pose is None: diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp index 3775edb..6ea44ac 100644 --- a/src/curobo/curobolib/cpp/geom_cuda.cpp +++ b/src/curobo/curobolib/cpp/geom_cuda.cpp @@ -57,7 +57,8 @@ std::vectorswept_sphere_obb_clpt( const bool enable_speed_metric, const bool transform_back, const bool compute_distance, - const bool use_batch_env); + const bool use_batch_env, + const bool sum_collisions); std::vector sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 @@ -66,6 +67,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 torch::Tensor sparsity_idx, const torch::Tensor weight, const torch::Tensor activation_distance, + const torch::Tensor max_distance, const torch::Tensor obb_accel, // n_boxes, 4, 4 const torch::Tensor obb_bounds, // n_boxes, 3 const torch::Tensor obb_pose, // n_boxes, 4, 4 @@ -78,8 +80,52 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 const int n_spheres, const bool transform_back, const bool compute_distance, - const bool use_batch_env); + const bool use_batch_env, + const bool sum_collisions, + const bool compute_esdf); +std::vector +sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor max_distance, + const torch::Tensor grid_features, // n_boxes, 4, 4 + const torch::Tensor grid_params, // n_boxes, 3 + const torch::Tensor grid_pose, // n_boxes, 4, 4 + const torch::Tensor grid_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_grid, + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, const int batch_size, const int horizon, + const int n_spheres, const bool transform_back, + const bool compute_distance, const bool use_batch_env, + const bool sum_collisions, + const bool compute_esdf); +std::vector +swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor max_distance, + const torch::Tensor speed_dt, + const torch::Tensor grid_features, // n_boxes, 4, 4 + const torch::Tensor grid_params, // n_boxes, 3 + const torch::Tensor grid_pose, // n_boxes, 4, 4 + const torch::Tensor grid_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_grid, + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, + const int batch_size, + const int horizon, + const int n_spheres, + const int sweep_steps, + const bool enable_speed_metric, + const bool transform_back, + const bool compute_distance, + const bool use_batch_env, + const bool sum_collisions); std::vectorpose_distance( torch::Tensor out_distance, torch::Tensor out_position_distance, @@ -159,11 +205,11 @@ 
std::vectorself_collision_distance_wrapper( std::vectorsphere_obb_clpt_wrapper( const torch::Tensor sphere_position, // batch_size, 4 - torch::Tensor distance, torch::Tensor closest_point, // batch size, 3 torch::Tensor sparsity_idx, const torch::Tensor weight, const torch::Tensor activation_distance, + const torch::Tensor max_distance, const torch::Tensor obb_accel, // n_boxes, 4, 4 const torch::Tensor obb_bounds, // n_boxes, 3 const torch::Tensor obb_pose, // n_boxes, 4, 4 @@ -171,8 +217,10 @@ std::vectorsphere_obb_clpt_wrapper( const torch::Tensor n_env_obb, // n_boxes, 4, 4 const torch::Tensor env_query_idx, // n_boxes, 4, 4 const int max_nobs, const int batch_size, const int horizon, - const int n_spheres, const bool transform_back, const bool compute_distance, - const bool use_batch_env) + const int n_spheres, + const bool transform_back, const bool compute_distance, + const bool use_batch_env, const bool sum_collisions = true, + const bool compute_esdf = false) { const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); @@ -185,9 +233,9 @@ std::vectorsphere_obb_clpt_wrapper( CHECK_INPUT(obb_accel); return sphere_obb_clpt( sphere_position, distance, closest_point, sparsity_idx, weight, - activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + activation_distance, max_distance, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres, - transform_back, compute_distance, use_batch_env); + transform_back, compute_distance, use_batch_env, sum_collisions, compute_esdf); } std::vectorswept_sphere_obb_clpt_wrapper( @@ -205,7 +253,7 @@ std::vectorswept_sphere_obb_clpt_wrapper( const int max_nobs, const int batch_size, const int horizon, const int n_spheres, const int sweep_steps, const bool enable_speed_metric, const bool transform_back, const bool compute_distance, - const bool use_batch_env) + const bool use_batch_env, const bool sum_collisions = true) { const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); @@ -218,7 +266,37 @@ std::vectorswept_sphere_obb_clpt_wrapper( distance, closest_point, sparsity_idx, weight, activation_distance, speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps, - enable_speed_metric, transform_back, compute_distance, use_batch_env); + enable_speed_metric, transform_back, compute_distance, use_batch_env, sum_collisions); +} + +std::vector +sphere_voxel_clpt_wrapper(const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor max_distance, + const torch::Tensor grid_features, // n_boxes, 4, 4 + const torch::Tensor grid_params, // n_boxes, 3 + const torch::Tensor grid_pose, // n_boxes, 4, 4 + const torch::Tensor grid_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_grid, + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_ngrid, const int batch_size, const int horizon, + const int n_spheres, const bool transform_back, + const bool compute_distance, const bool use_batch_env, + const bool sum_collisions, + const bool compute_esdf) +{ + const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); + + CHECK_INPUT(distance); + CHECK_INPUT(closest_point); + CHECK_INPUT(sphere_position); + return sphere_voxel_clpt(sphere_position, distance, closest_point, sparsity_idx, weight, + 
activation_distance, max_distance, grid_features, grid_params, + grid_pose, grid_enable, n_env_grid, env_query_idx, max_ngrid, batch_size, horizon, n_spheres, + transform_back, compute_distance, use_batch_env, sum_collisions, compute_esdf); } std::vectorpose_distance_wrapper( @@ -297,6 +375,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Closest Point OBB(curobolib)"); m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper, "Swept Closest Point OBB(curobolib)"); + m.def("closest_point_voxel", &sphere_voxel_clpt_wrapper, + "Closest Point Voxel(curobolib)"); + m.def("swept_closest_point_voxel", &swept_sphere_voxel_clpt, + "Swpet Closest Point Voxel(curobolib)"); + + m.def("self_collision_distance", &self_collision_distance_wrapper, "Self Collision Distance (curobolib)"); diff --git a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp index b9bb8ea..ac8bbe5 100644 --- a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp +++ b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp @@ -14,38 +14,6 @@ #include -// CUDA forward declarations -std::vectorreduce_cuda(torch::Tensor vec, - torch::Tensor vec2, - torch::Tensor rho_buffer, - torch::Tensor sum, - const int batch_size, - const int v_dim, - const int m); - -std::vector -lbfgs_step_cuda(torch::Tensor step_vec, - torch::Tensor rho_buffer, - torch::Tensor y_buffer, - torch::Tensor s_buffer, - torch::Tensor grad_q, - const float epsilon, - const int batch_size, - const int m, - const int v_dim); - -std::vector -lbfgs_update_cuda(torch::Tensor rho_buffer, - torch::Tensor y_buffer, - torch::Tensor s_buffer, - torch::Tensor q, - torch::Tensor grad_q, - torch::Tensor x_0, - torch::Tensor grad_0, - const int batch_size, - const int m, - const int v_dim); - std::vector lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, @@ -59,7 +27,8 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, const int batch_size, const int m, const int v_dim, - const bool stable_mode); + const bool stable_mode, + const bool use_shared_buffers); // C++ interface @@ -71,58 +40,12 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -std::vector -lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer, - torch::Tensor y_buffer, torch::Tensor s_buffer, - torch::Tensor grad_q, const float epsilon, const int batch_size, - const int m, const int v_dim) -{ - CHECK_INPUT(step_vec); - CHECK_INPUT(rho_buffer); - CHECK_INPUT(y_buffer); - CHECK_INPUT(s_buffer); - CHECK_INPUT(grad_q); - const at::cuda::OptionalCUDAGuard guard(grad_q.device()); - - return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q, - epsilon, batch_size, m, v_dim); -} - -std::vector -lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer, - torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, - torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, - const int m, const int v_dim) -{ - CHECK_INPUT(rho_buffer); - CHECK_INPUT(y_buffer); - CHECK_INPUT(s_buffer); - CHECK_INPUT(grad_q); - CHECK_INPUT(x_0); - CHECK_INPUT(grad_0); - CHECK_INPUT(q); - const at::cuda::OptionalCUDAGuard guard(grad_q.device()); - - return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, - grad_0, batch_size, m, v_dim); -} - -std::vector -reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2, - torch::Tensor rho_buffer, torch::Tensor sum, - const int batch_size, const int v_dim, const int m) -{ - const at::cuda::OptionalCUDAGuard guard(sum.device()); - - return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, 
v_dim, m); -} - std::vector lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, const float epsilon, const int batch_size, const int m, - const int v_dim, const bool stable_mode) + const int v_dim, const bool stable_mode, const bool use_shared_buffers) { CHECK_INPUT(step_vec); CHECK_INPUT(rho_buffer); @@ -136,13 +59,11 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer, return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0, epsilon, batch_size, m, v_dim, - stable_mode); + stable_mode, use_shared_buffers); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)"); - m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)"); + m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)"); - m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug"); } diff --git a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu index 1bad714..503aaba 100644 --- a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu +++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu @@ -12,7 +12,7 @@ #include #include #include - +#include #include #include @@ -32,200 +32,20 @@ // #include // #include -#define M_MAX 512 -#define HALF_MAX 65504.0 -#define M 15 -#define VDIM 175 // 25 * 7, - +//#define M_MAX 512 +//#define HALF_MAX 65504.0 +//#define M 15 +//#define VDIM 175 // 25 * 7, #define FULL_MASK 0xffffffff - +#define VOLTA_PLUS true namespace Curobo { namespace Optimization { - template - __device__ __forceinline__ void - scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out, - const int v_dim) - { - for (int i = 0; i < v_dim; i++) - { - out[i] = a * b[i]; - } - } - - template - __device__ __forceinline__ void - m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out, - const int v_dim, const int m) - { - for (int j = 0; j < m; j++) - { - for (int i = 0; i < v_dim; i++) - { - out[j * v_dim + i] = a[j] * b[j * v_dim + i]; - } - } - } - - template - __device__ __forceinline__ void vec_vec_dot(const scalar_t *a, - const scalar_t *b, scalar_t& out, - const int v_dim) - { - for (int i = 0; i < v_dim; i++) - { - out += a[i] * b[i]; - } - } - - template - __device__ __forceinline__ void update_r(const scalar_t *rho_y, - const scalar_t *s_buffer, scalar_t *r, - scalar_t& alpha, const int v_dim) - { - // dot product: and subtract with alpha - for (int i = 0; i < v_dim; i++) - { - alpha -= rho_y[i] * r[i]; - } - - // scalar vector product: - for (int i = 0; i < v_dim; i++) - { - r[i] = r[i] + alpha * s_buffer[i]; - } - } - - template - __device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq, - const scalar_t alpha, - const int v_dim) - { - // - for (int i = 0; i < v_dim; i++) - { - gq[i] = gq[i] - (alpha * y_buffer[i]); - } - } - - template - __global__ void - lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer, - const scalar_t *y_buffer, const scalar_t *s_buffer, - const scalar_t *grad_q, const float epsilon, - const int batchSize, const int m, const int v_dim) - { - // each thread writes one sphere of some link - const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch - const int b_idx = t; - - if (t >= (batchSize)) - { - return; - } - - // get thread start address: - const int b_start_scalar_adrs = b_idx * m; - const int b_start_vec_adrs = b_idx * m * v_dim; - const int b_step_start_adrs = 
b_idx * v_dim; - - scalar_t rho_s[M * VDIM]; - - // copy floats to local buffer? - // y_buffer, s_buffer, rho_buffer - // compute rho_s - scalar_t loc_ybuf[M * VDIM]; - scalar_t loc_sbuf[M * VDIM]; - scalar_t loc_rho[M]; - scalar_t gq[VDIM]; // , l_q[VDIM]; - scalar_t alpha_buffer[M]; - scalar_t t_1, t_2; - - for (int i = 0; i < m * v_dim; i++) - { - loc_ybuf[i] = y_buffer[b_start_vec_adrs + i]; - loc_sbuf[i] = s_buffer[b_start_vec_adrs + i]; - } - - for (int i = 0; i < v_dim; i++) - { - gq[i] = grad_q[b_step_start_adrs + i]; - } - - for (int i = 0; i < m; i++) - { - loc_rho[i] = rho_buffer[b_start_scalar_adrs + i]; - } - - m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m); - - // for loop over m - for (int i = m - 1; i > m - 2; i--) - { - // l_start_vec_adrs = i * v_dim; - // scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim], - // v_dim); - vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim); - update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim); - } - - // compute initial hessian: - vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1, - v_dim); - vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2, - v_dim); - t_1 = t_1 / t_2; - - if (t_1 < 0) - { - t_1 = 0; - } - - t_1 += epsilon; - scalar_vec_product(t_1, &gq[0], &gq[0], v_dim); - - m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m); - - for (int i = 0; i < m; i++) - { - // scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim], - // v_dim); - update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i], - v_dim); - } - - // write gq to out grad: - - for (int i = 0; i < v_dim; i++) - { - step_vec[b_step_start_adrs + i] = -1.0 * gq[i]; - } - } - - template - __forceinline__ __device__ psum_t warpReduce(psum_t v, int elems, - unsigned mask) - { - psum_t val = v; - int shift = 1; - - for (int i = elems; i > 1; i /= 2) - { - val += __shfl_down_sync(mask, val, shift); - shift *= 2; - } - - // val += __shfl_down_sync(mask, val, 1); // i=32 - // val += __shfl_down_sync(mask, val, 2); // i=16 - // val += __shfl_down_sync(mask, val, 4); // i=8 - // val += __shfl_down_sync(mask, val, 8); // i=4 - // val += __shfl_down_sync(mask, val, 16); // i=2 - return val; - } - + + template - __forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data, + __forceinline__ __device__ void reduce_v0(scalar_t v, int m, psum_t *data, scalar_t *result) { psum_t val = v; @@ -238,7 +58,8 @@ namespace Curobo val += __shfl_down_sync(mask, val, 16); // int leader = __ffs(mask) – 1; // select a leader lane - int leader = 0; + const int leader = 0; + if (threadIdx.x % 32 == leader) { @@ -271,7 +92,6 @@ namespace Curobo } // int leader = __ffs(mask2) – 1; // select a leader lane - int leader = 0; if (threadIdx.x % 32 == leader) { @@ -280,8 +100,35 @@ namespace Curobo } } __syncthreads(); + } + + + template__inline__ __device__ scalar_t relu(scalar_t var) + { + if (var < 0) + return 0; + else + return var; + } + + template + __forceinline__ __device__ psum_t warpReduce(psum_t v, + const int elems, + unsigned mask) + { + psum_t val = v; + int shift = 1; + + #pragma unroll + for (int i = elems; i > 1; i /= 2) + { + val += __shfl_down_sync(mask, val, shift); + shift *= 2; + } + return val; + } // blockReduce template __forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data, @@ -292,12 +139,23 @@ namespace Curobo // int leader = __ffs(mask) – 1; // select a leader lane int leader = 0; - + if (threadIdx.x % 32 
== leader) + { + if (m < 32) + { + result[0] = (scalar_t)val; + } + else + { + data[(threadIdx.x + 1) / 32] = val; + } + } + /* if (threadIdx.x % 32 == leader) { data[(threadIdx.x + 1) / 32] = val; } - + */ if (m >= 32) { __syncthreads(); @@ -307,10 +165,20 @@ namespace Curobo if (threadIdx.x / 32 == 0) // only the first warp will do this work { - psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2); + psum_t val2 = data[threadIdx.x % 32]; + int shift = 1; + + #pragma unroll + for (int i = elems - 1; i > 0; i /= 2) + { + val2 += __shfl_down_sync(mask2, val2, shift); + shift *= 2; + } + + //psum_t val2 = warpReduce(data[threadIdx.x % 32], elems - 1, mask2); // // int leader = __ffs(mask2) – 1; // select a leader lane - if (threadIdx.x == leader) + if (threadIdx.x % 32 == leader) { result[0] = (scalar_t)val2; } @@ -325,205 +193,7 @@ namespace Curobo } __syncthreads(); } - - template - __inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2, - const int m, psum_t *data, scalar_t *result) - { - scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x]; - - reduce(val, m, data, result); - } - - template__inline__ __device__ scalar_t relu(scalar_t var) - { - if (var < 0) - return 0; - else - return var; - } - - ////////////////////////////////////////////////////////// - // one block per batch - // v_dim threads per block - ////////////////////////////////////////////////////////// - template - __global__ void lbfgs_step_kernel(scalar_t *step_vec, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - const scalar_t *y_buffer, // m x b x 175 - const scalar_t *s_buffer, // m x b x 175 - const scalar_t *grad_q, // b x 175 - const float epsilon, const int batchsize, - const int m, const int v_dim) - { - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - __shared__ scalar_t gq[175]; /// gq = batch * v_dim - - assert(v_dim < 176); - int batch = blockIdx.x; // one block per batch - - if (threadIdx.x >= v_dim) - return; - - gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq - - scalar_t alpha_buffer[16]; - - // assert(m<16); // allocating a buffer assuming m < 16 - - for (int i = m - 1; i > -1; i--) - { - dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, - &data[0], &result); - alpha_buffer[i] = result * rho_buffer[i * batchsize + batch]; - gq[threadIdx.x] = - gq[threadIdx.x] - - alpha_buffer[i] * - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } - - // compute var1 - scalar_t val1 = - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; - scalar_t val2 = - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x]; - reduce(val1 * val1, v_dim, data, &result); - scalar_t denominator = result; - reduce(val1 * val2, v_dim, data, &result); - scalar_t numerator = result; - scalar_t var1 = numerator / denominator; - - scalar_t gamma = relu(var1) + epsilon; // epsilon - - gq[threadIdx.x] = gamma * gq[threadIdx.x]; - - for (int i = 0; i < m; i++) - { - dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim, - &data[0], &result); - gq[threadIdx.x] = - gq[threadIdx.x] + - (alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) * - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } - - step_vec[batch * v_dim + threadIdx.x] = - -1 * gq[threadIdx.x]; // copy from shared memory to global memory - } - - template - __global__ void 
lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1 - scalar_t *y_buffer, // m x b x 175 - scalar_t *s_buffer, // m x b x 175 - scalar_t *q, // b x 175 - scalar_t *x_0, // b x 175 - scalar_t *grad_0, // b x 175 - const scalar_t *grad_q, // b x 175 - const int batchsize, const int m, - const int v_dim) - { - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - - // __shared__ scalar_t y[175]; // temporary shared memory storage - // __shared__ scalar_t s[175]; // temporary shared memory storage - assert(v_dim <= VDIM); - int batch = blockIdx.x; // one block per batch - - if (threadIdx.x >= v_dim) - return; - - scalar_t y = - grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x]; - scalar_t s = - q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce(y * s, v_dim, &data[0], &result); - - for (int i = 1; i < m; i++) - { - s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = - y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; - } - - // __syncthreads(); - - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; - grad_0[batch * v_dim + - threadIdx.x] = - grad_q[batch * v_dim + threadIdx.x]; - x_0[batch * v_dim + - threadIdx.x] = - q[batch * v_dim + threadIdx.x]; - - if (threadIdx.x == 0) - { - scalar_t rho = 1 / result; - - for (int i = 1; i < m; i++) - { - rho_buffer[(i - 1) * batchsize + batch] = - rho_buffer[i * batchsize + batch]; - } - rho_buffer[(m - 1) * batchsize + batch] = rho; - } - } - - template - __global__ void reduce_kernel( - scalar_t *vec1, // b x 175 - scalar_t *vec2, // b x 175 - scalar_t *rho_buffer, // m x b x 1 - - scalar_t *sum_out, // m x b x 1 - - const int batchsize, const int m, - const int v_dim) // s_buffer and y_buffer are not rolled by default - { - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction - __shared__ scalar_t - result; // result of the reduction or vector-vector dot product - int batch = blockIdx.x; // one block per batch - - if (threadIdx.x >= v_dim) - return; - - //////////////////// - // update_buffer - //////////////////// - scalar_t y = vec1[batch * v_dim + threadIdx.x]; - scalar_t s = vec2[batch * v_dim + threadIdx.x]; - - reduce(y * s, v_dim, &data[0], &result); - scalar_t numerator = result; - - if (threadIdx.x == 0) - { - sum_out[batch] = 1 / numerator; - } - - // return; - if (threadIdx.x < m - 1) - { - // m thread participate to shif the values - // this is safe as m<32 and this happens in lockstep - rho_buffer[threadIdx.x * batchsize + batch] = - rho_buffer[(threadIdx.x + 1) * batchsize + batch]; - } - else if (threadIdx.x == m - 1) - { - scalar_t rho = (1 / numerator); - rho_buffer[threadIdx.x * batchsize + batch] = rho; - } - } + template __global__ void lbfgs_update_buffer_and_step_v1( @@ -535,26 +205,24 @@ namespace Curobo scalar_t *x_0, // b x 175 scalar_t *grad_0, // b x 175 const scalar_t *grad_q, // b x 175 - const float epsilon, const int batchsize, const int m, const int v_dim, - const bool stable_mode = - false) // s_buffer and y_buffer are not rolled by default + const float epsilon, const int batchsize, const int lbfgs_history, const int v_dim, + const bool stable_mode = 
false) // s_buffer and y_buffer are not rolled by default { - // extern __shared__ scalar_t alpha_buffer_sh[]; - extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; - scalar_t *my_smem_rc = reinterpret_cast(my_smem); - scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x - scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m - scalar_t *s_buffer_sh = - &my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x - scalar_t *y_buffer_sh = - &my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x + extern __shared__ float my_smem_rc[]; + //__shared__ float my_smem_rc[21 * (3 * (32 * 7) + 1)]; + int history_m = lbfgs_history; + + // align the external shared memory by 4 bytes + float* s_buffer_sh = (float *) &my_smem_rc; // m*blockDim.x + + float* y_buffer_sh = (float *) &s_buffer_sh[history_m * v_dim]; // m*blockDim.x + float* alpha_buffer_sh = (float *) &y_buffer_sh[history_m * v_dim]; // m*blockDim.x + float* rho_buffer_sh = (float *) &alpha_buffer_sh[history_m * v_dim]; // m*blockDim.x + + psum_t* data = (psum_t *)&rho_buffer_sh[history_m]; + float* result = (float *)&data[32]; + - __shared__ psum_t - data[32]; // temporary buffer needed for - // block-wide reduction - __shared__ scalar_t - result; // result of the reduction or - // vector-vector dot product int batch = blockIdx.x; // one block per batch if (threadIdx.x >= v_dim) @@ -562,7 +230,6 @@ namespace Curobo scalar_t gq; gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq - //////////////////// // update_buffer //////////////////// @@ -571,18 +238,17 @@ namespace Curobo // if y is close to zero scalar_t s = q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce_v1(y * s, v_dim, &data[0], &result); - // reduce(y * s, v_dim, &data[0], &result); - scalar_t numerator = result; + //reduce_v1(y * s, v_dim, &data[0], &result); + reduce_v1(y * s, v_dim, &data[0], result); + - // scalar_t rho = 1.0/numerator; + scalar_t numerator = result[0]; if (!rolled_ys) { -#pragma unroll - - for (int i = 1; i < m; i++) + #pragma unroll + for (int i = 1; i < history_m; i++) { scalar_t st = s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; @@ -590,30 +256,30 @@ namespace Curobo y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st; y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt; - s_buffer_sh[m * threadIdx.x + i - 1] = st; - y_buffer_sh[m * threadIdx.x + i - 1] = yt; + s_buffer_sh[history_m * threadIdx.x + i - 1] = st; + y_buffer_sh[history_m * threadIdx.x + i - 1] = yt; } } - s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; - y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; - s_buffer_sh[m * threadIdx.x + m - 1] = s; - y_buffer_sh[m * threadIdx.x + m - 1] = y; + s_buffer[(history_m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(history_m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + s_buffer_sh[history_m * threadIdx.x + history_m - 1] = s; + y_buffer_sh[history_m * threadIdx.x + history_m - 1] = y; grad_0[batch * v_dim + threadIdx.x] = gq; x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; - if (threadIdx.x < m - 1) + if (threadIdx.x < history_m - 1) { // m thread participate to shif the values // this is safe as m<32 and this happens in lockstep scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch]; rho_buffer[threadIdx.x * batchsize 
+ batch] = rho; - rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x] = rho; } - if (threadIdx.x == m - 1) + if (threadIdx.x == history_m - 1) { scalar_t rho = 1.0 / numerator; @@ -623,10 +289,9 @@ namespace Curobo rho = 0.0; } rho_buffer[threadIdx.x * batchsize + batch] = rho; - rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x] = rho; } - // return; __syncthreads(); //////////////////// @@ -635,25 +300,29 @@ namespace Curobo // scalar_t alpha_buffer[16]; // assert(m<16); // allocating a buffer assuming m < 16 -#pragma unroll - - for (int i = m - 1; i > -1; i--) + #pragma unroll + for (int i = history_m - 1; i > -1; i--) { // reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], // v_dim, &data[0], &result); - reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); - alpha_buffer_sh[threadIdx.x * m + i] = - result * rho_buffer_sh[i * batchsize + batch]; + //reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + reduce_v1(gq * s_buffer_sh[history_m * threadIdx.x + i], v_dim, &data[0], result); + + alpha_buffer_sh[threadIdx.x * history_m + i] = + result[0] * rho_buffer_sh[i]; // gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim + // batch*v_dim + threadIdx.x]; - gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * - y_buffer_sh[m * threadIdx.x + i]; + gq = gq - alpha_buffer_sh[threadIdx.x * history_m + i] * + y_buffer_sh[history_m * threadIdx.x + i]; } + //return; // compute var1 - reduce_v1(y * y, v_dim, data, &result); - scalar_t denominator = result; + //reduce_v1(y * y, v_dim, &data[0], &result); + reduce_v1(y * y, v_dim, &data[0], result); + + scalar_t denominator = result[0]; // reduce(s*y, v_dim, data, &result); // redundant - already computed it above // scalar_t numerator = result; @@ -670,50 +339,187 @@ namespace Curobo scalar_t gamma = relu(var1); gq = gamma * gq; -#pragma unroll - - for (int i = 0; i < m; i++) + #pragma unroll + for (int i = 0; i < history_m; i++) { // reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], // v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] - // result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim + // batch*v_dim + threadIdx.x]; - reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); - gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - - result * rho_buffer_sh[i * batchsize + batch]) * - s_buffer_sh[m * threadIdx.x + i]; + //reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + reduce_v1(gq * y_buffer_sh[history_m * threadIdx.x + i], v_dim, &data[0], result); + + gq = gq + (alpha_buffer_sh[threadIdx.x * history_m + i] - + result[0] * rho_buffer_sh[i]) * + s_buffer_sh[history_m * threadIdx.x + i]; } step_vec[batch * v_dim + threadIdx.x] = -1.0 * gq; // copy from shared memory to global memory } - // (32/M) rolls per warp - // Threads in a warp in a GPU execute in a lock-step. We leverage that to do - // the roll without using temporary storage or explicit synchronization. 
- template - __global__ void lbfgs_roll(scalar_t *a, // m x b x 175 - scalar_t *b, // m x b x 175 - const int m_t, const int batchsize, const int m, - const int v_dim) + template + __global__ void lbfgs_update_buffer_and_step_v1_compile_m( + scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, const int lbfgs_history, const int v_dim, + const bool stable_mode = false) // s_buffer and y_buffer are not rolled by default { - assert(m_t <= 32); - int t = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ float my_smem_rc[]; + //__shared__ float my_smem_rc[21 * (3 * (32 * 7) + 1)]; + + // align the external shared memory by 4 bytes + float* s_buffer_sh = (float *) &my_smem_rc; // m*blockDim.x - if (t >= m_t * v_dim * batchsize) + float* y_buffer_sh = (float *) &s_buffer_sh[FIXED_M * v_dim]; // m*blockDim.x + float* alpha_buffer_sh = (float *) &y_buffer_sh[FIXED_M * v_dim]; // m*blockDim.x + float* rho_buffer_sh = (float *) &alpha_buffer_sh[FIXED_M * v_dim]; // m*blockDim.x + + psum_t* data = (psum_t *)&rho_buffer_sh[FIXED_M]; + float* result = (float *)&data[32]; + + + int batch = blockIdx.x; // one block per batch + + if (threadIdx.x >= v_dim) return; - int _m = t % m_t; - int _v_dim = (t / m_t) % v_dim; - int batch = t / (m * v_dim); // this line could be wrong? + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; - if (_m < m - 1) + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + + //reduce_v1(y * s, v_dim, &data[0], &result); + reduce_v1(y * s, v_dim, &data[0], result); + + + scalar_t numerator = result[0]; + + if (!rolled_ys) { - a[_m * batchsize * v_dim + batch * v_dim + _v_dim] = - a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; - b[_m * batchsize * v_dim + batch * v_dim + _v_dim] = - b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; + scalar_t st = 0; + scalar_t yt = 0; + #pragma unroll + for (int i = 1; i < FIXED_M ; i++) + { + st = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + yt = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt; + s_buffer_sh[FIXED_M * threadIdx.x + i - 1] = st; + y_buffer_sh[FIXED_M * threadIdx.x + i - 1] = yt; + } } + + s_buffer[(FIXED_M - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(FIXED_M - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + s_buffer_sh[FIXED_M * threadIdx.x + FIXED_M - 1] = s; + y_buffer_sh[FIXED_M * threadIdx.x + FIXED_M - 1] = y; + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + + threadIdx.x] = + q[batch * v_dim + threadIdx.x]; + + if (threadIdx.x < FIXED_M - 1) + { + // m thread participate to shif the values + // this is safe as m<32 and this happens in lockstep + scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x] = rho; + } + + if (threadIdx.x == FIXED_M - 1) + { + scalar_t rho = 1.0 / numerator; + + // if this is nan, make 
it zero: + if (stable_mode && (numerator == 0.0)) + { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x] = rho; + } + + __syncthreads(); + + //////////////////// + // step + //////////////////// + // scalar_t alpha_buffer[16]; + // assert(m<16); // allocating a buffer assuming m < 16 + + #pragma unroll + for (int i = FIXED_M - 1; i > -1; i--) + { + // reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); + //reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + reduce_v1(gq * s_buffer_sh[FIXED_M * threadIdx.x + i], v_dim, &data[0], result); + + alpha_buffer_sh[threadIdx.x * FIXED_M + i] = + result[0] * rho_buffer_sh[i]; + + // gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + gq = gq - alpha_buffer_sh[threadIdx.x * FIXED_M + i] * + y_buffer_sh[FIXED_M * threadIdx.x + i]; + } + //return; + + // compute var1 + //reduce_v1(y * y, v_dim, &data[0], &result); + reduce_v1(y * y, v_dim, &data[0], result); + + scalar_t denominator = result[0]; + + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + // To improve stability, uncomment below line: [this however leads to poor + // convergence] + + if (stable_mode && (denominator == 0.0)) + { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + gq = gamma * gq; + + #pragma unroll + for (int i = 0; i < FIXED_M ; i++) + { + // reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] - + // result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + //reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + reduce_v1(gq * y_buffer_sh[FIXED_M * threadIdx.x + i], v_dim, &data[0], result); + + gq = gq + (alpha_buffer_sh[threadIdx.x * FIXED_M + i] - + result[0] * rho_buffer_sh[i]) * + s_buffer_sh[FIXED_M * threadIdx.x + i]; + } + + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory } template @@ -728,14 +534,19 @@ namespace Curobo const scalar_t *grad_q, // b x 175 const float epsilon, const int batchsize, const int m, const int v_dim, const bool stable_mode = - false) // s_buffer and y_buffer are not rolled by default + false) // s_buffer and y_buffer are not rolled by default { - // extern __shared__ scalar_t alpha_buffer_sh[]; - extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; - scalar_t *alpha_buffer_sh = reinterpret_cast(my_smem); - - __shared__ psum_t - data[32]; // temporary buffer needed for block-wide reduction + extern __shared__ float alpha_buffer_sh[]; + + //extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; + //scalar_t *alpha_buffer_sh = reinterpret_cast(my_smem); + + //extern __shared__ __align__(sizeof(float)) unsigned char my_smem[]; + //float *alpha_buffer_sh = reinterpret_cast(my_smem); + + __shared__ psum_t + data[32]; + // temporary buffer needed for block-wide reduction __shared__ scalar_t result; // result of the reduction or vector-vector dot product int batch = blockIdx.x; // one block per batch @@ -754,7 +565,7 @@ namespace Curobo // if y is close to zero scalar_t s = q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; - reduce(y * s, v_dim, &data[0], &result); + reduce_v1(y * s, v_dim, &data[0], 
&result); scalar_t numerator = result; // scalar_t rho = 1.0/numerator; @@ -805,11 +616,10 @@ namespace Curobo // scalar_t alpha_buffer[16]; // assert(m<16); // allocating a buffer assuming m < 16 -#pragma unroll - + #pragma unroll for (int i = m - 1; i > -1; i--) { - reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + reduce_v1(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], v_dim, &data[0], &result); alpha_buffer_sh[threadIdx.x * m + i] = result * rho_buffer[i * batchsize + batch]; @@ -818,7 +628,7 @@ namespace Curobo } // compute var1 - reduce(y * y, v_dim, data, &result); + reduce_v1(y * y, v_dim, &data[0], &result); scalar_t denominator = result; // reduce(s*y, v_dim, data, &result); // redundant - already computed it above @@ -837,11 +647,10 @@ namespace Curobo gq = gamma * gq; -#pragma unroll - + #pragma unroll for (int i = 0; i < m; i++) { - reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + reduce_v1(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - result * rho_buffer[i * batchsize + batch]) * @@ -851,157 +660,287 @@ namespace Curobo step_vec[batch * v_dim + threadIdx.x] = -1.0 * gq; // copy from shared memory to global memory } + + +template + __global__ void lbfgs_update_buffer_and_step_compile_m( + scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, const int m, const int v_dim, + const bool stable_mode = + false) // s_buffer and y_buffer are not rolled by default + { + extern __shared__ float alpha_buffer_sh[]; + + //extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; + //scalar_t *alpha_buffer_sh = reinterpret_cast(my_smem); + + //extern __shared__ __align__(sizeof(float)) unsigned char my_smem[]; + //float *alpha_buffer_sh = reinterpret_cast(my_smem); + + __shared__ psum_t data[32]; + // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + int batch = blockIdx.x; // one block per batch + + if (threadIdx.x >= v_dim) + return; + + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; + + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce_v1(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; + + // scalar_t rho = 1.0/numerator; + + if (!rolled_ys) + { + + # pragma unroll + for (int i = 1; i < FIXED_M; i++) + { + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + } + + s_buffer[(FIXED_M - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(FIXED_M - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + + threadIdx.x] = + q[batch * v_dim + threadIdx.x]; + + if (threadIdx.x < FIXED_M - 1) + { + // m thread participate to shif 
the values + // this is safe as m<32 and this happens in lockstep + rho_buffer[threadIdx.x * batchsize + batch] = + rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + } + + if (threadIdx.x == FIXED_M - 1) + { + scalar_t rho = 1.0 / numerator; + + // if this is nan, make it zero: + if (stable_mode && (numerator == 0.0)) + { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + } + + + #pragma unroll + for (int i = FIXED_M - 1; i > -1; i--) + { + reduce_v1(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + alpha_buffer_sh[threadIdx.x * FIXED_M + i] = + result * rho_buffer[i * batchsize + batch]; + gq = gq - alpha_buffer_sh[threadIdx.x * FIXED_M + i] * + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + // compute var1 + reduce_v1(y * y, v_dim, &data[0], &result); + scalar_t denominator = result; + + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + // To improve stability, uncomment below line: [this however leads to poor + // convergence] + + if (stable_mode && (denominator == 0.0)) + { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + + gq = gamma * gq; + + #pragma unroll + for (int i = 0; i < FIXED_M; i++) + { + reduce_v1(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + gq = gq + (alpha_buffer_sh[threadIdx.x * FIXED_M + i] - + result * rho_buffer[i * batchsize + batch]) * + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory + } } // namespace Optimization } // namespace Curobo -std::vector -lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer, - torch::Tensor y_buffer, torch::Tensor s_buffer, - torch::Tensor grad_q, const float epsilon, const int batch_size, - const int m, const int v_dim) -{ - using namespace Curobo::Optimization; - const int threadsPerBlock = 128; - const int blocksPerGrid = - ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; - - // launch threads per batch: - // int threadsPerBlock = pow(2,((int)log2(v_dim))+1); - - // const int blocksPerGrid = batch_size; //((batch_size) + threadsPerBlock - - // 1) / threadsPerBlock; - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - AT_DISPATCH_FLOATING_TYPES( - step_vec.scalar_type(), "lbfgs_step_cu", ([&] { - lbfgs_step_kernel_old - << < blocksPerGrid, threadsPerBlock, - v_dim * sizeof(step_vec.scalar_type()), stream >> > ( - step_vec.data_ptr(), rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - grad_q.data_ptr(), epsilon, batch_size, m, v_dim); - })); - - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return { step_vec }; -} - -std::vector -lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer, - torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, - torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, - const int m, const int v_dim) -{ - using namespace Curobo::Optimization; - - // const int threadsPerBlock = 128; - // launch threads per batch: - // int threadsPerBlock = pow(2,((int)log2(v_dim))+1); - int threadsPerBlock = v_dim; - - const int blocksPerGrid = - batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_update_cu", ([&] { - 
lbfgs_update_buffer_kernel - << < blocksPerGrid, threadsPerBlock, - v_dim * sizeof(y_buffer.scalar_type()), stream >> > ( - rho_buffer.data_ptr(), y_buffer.data_ptr(), - s_buffer.data_ptr(), q.data_ptr(), - x_0.data_ptr(), grad_0.data_ptr(), - grad_q.data_ptr(), batch_size, m, v_dim); - })); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return { rho_buffer, y_buffer, s_buffer, x_0, grad_0 }; -} std::vector lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, - const float epsilon, const int batch_size, const int m, - const int v_dim, const bool stable_mode) + const float epsilon, const int batch_size, const int history_m, + const int v_dim, const bool stable_mode, const bool use_shared_buffers) { using namespace Curobo::Optimization; // call first kernel: + //const bool use_experimental = true; + const bool use_fixed_m = true; int threadsPerBlock = v_dim; assert(threadsPerBlock < 1024); - assert(m < M_MAX); - int blocksPerGrid = - batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + assert(history_m < 32); + int blocksPerGrid = batch_size; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - int smemsize = 0; - if (true) + const int smemsize = history_m * v_dim * sizeof(float); + const int shared_buffer_smemsize = (((3 * v_dim) + 1) * history_m + 32) * sizeof(float); + const int max_shared_increase = shared_buffer_smemsize; + const int max_shared_base = 48000; + const int max_shared_allowed = 65536; // Turing limit, others support 98304 + int max_shared = max_shared_base; + + auto kernel_5 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + auto kernel_6 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + auto kernel_7 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + + auto kernel_15 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + auto kernel_27 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + + auto kernel_31 = Curobo::Optimization::lbfgs_update_buffer_and_step_v1_compile_m; + auto kernel_n = Curobo::Optimization::lbfgs_update_buffer_and_step_v1; + + + auto stable_kernel_5 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + auto stable_kernel_6 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + auto stable_kernel_7 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + + auto stable_kernel_15 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + auto stable_kernel_27 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + + auto stable_kernel_31 = Curobo::Optimization::lbfgs_update_buffer_and_step_compile_m; + auto stable_kernel_n = Curobo::Optimization::lbfgs_update_buffer_and_step; + + + auto selected_kernel = kernel_n; + auto stable_selected_kernel = stable_kernel_n; + + + switch (history_m) { - AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] { - smemsize = m * threadsPerBlock * sizeof(scalar_t); - lbfgs_update_buffer_and_step - << < blocksPerGrid, threadsPerBlock, smemsize, stream >> > ( - step_vec.data_ptr(), - rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - q.data_ptr(), x_0.data_ptr(), - grad_0.data_ptr(), grad_q.data_ptr(), - epsilon, batch_size, m, v_dim, stable_mode); - }); + + case 5: + selected_kernel = kernel_5; + stable_selected_kernel = stable_kernel_5; + break; + case 6: + selected_kernel = kernel_6; + 
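      // Each case binds the regular and stable kernel variants specialized above
      // for a fixed history size (FIXED_M as a template parameter), so the
      // #pragma unroll loops inside them get compile-time trip counts; history
      // sizes without a specialization fall back to kernel_n / stable_kernel_n,
      // which receive the history length m at runtime.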
stable_selected_kernel = stable_kernel_6; + break; + case 7: + selected_kernel = kernel_7; + stable_selected_kernel = stable_kernel_7; + break; + + case 15: + selected_kernel = kernel_15; + stable_selected_kernel = stable_kernel_15; + + break; + case 27: + selected_kernel = kernel_27; + stable_selected_kernel = stable_kernel_27; + + break; + case 31: + selected_kernel = kernel_31; + stable_selected_kernel = stable_kernel_31; + + break; } - else + if (!use_fixed_m) { - // v1 does not work - AT_DISPATCH_FLOATING_TYPES( - y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] { - smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) + - m * batch_size * sizeof(scalar_t); - lbfgs_update_buffer_and_step_v1 - << < blocksPerGrid, threadsPerBlock, smemsize, stream >> > ( - step_vec.data_ptr(), - rho_buffer.data_ptr(), - y_buffer.data_ptr(), s_buffer.data_ptr(), - q.data_ptr(), x_0.data_ptr(), - grad_0.data_ptr(), grad_q.data_ptr(), - epsilon, batch_size, m, v_dim, stable_mode); - }); + stable_selected_kernel = stable_kernel_n; + selected_kernel = kernel_n; + } + + // try to increase shared memory: + // Note that this feature is only available from volta+ (cuda 7.0+) + #if (VOLTA_PLUS) + { + + + if (use_shared_buffers && max_shared_increase > max_shared_base && max_shared_increase <= max_shared_allowed) + { + max_shared = max_shared_increase; + cudaError_t result; + result = cudaFuncSetAttribute(selected_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_increase); + if (result != cudaSuccess) + { + max_shared = max_shared_base; + } + + } + } + #endif + + if (use_shared_buffers && shared_buffer_smemsize <= max_shared) + { + + selected_kernel + << < blocksPerGrid, threadsPerBlock, shared_buffer_smemsize, stream >> > ( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, history_m, v_dim, stable_mode); + + + } + else + { + + stable_selected_kernel + << < blocksPerGrid, threadsPerBlock, smemsize, stream >> > ( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, history_m, v_dim, stable_mode); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); return { step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0 }; } -std::vectorreduce_cuda(torch::Tensor vec, torch::Tensor vec2, - torch::Tensor rho_buffer, - torch::Tensor sum, const int batch_size, - const int m, const int v_dim) -{ - using namespace Curobo::Optimization; - - int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1); - int blocksPerGrid = - batch_size; // ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; - // printf("threadsPerBlock:%d, blocksPerGrid: %d\n", - // threadsPerBlock, blocksPerGrid); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - AT_DISPATCH_FLOATING_TYPES( - vec.scalar_type(), "reduce_cu", ([&] { - reduce_kernel - << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( - vec.data_ptr(), vec2.data_ptr(), - rho_buffer.data_ptr(), sum.data_ptr(), - batch_size, m, v_dim); - })); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return { sum, rho_buffer }; -} diff --git a/src/curobo/curobolib/cpp/line_search_kernel.cu b/src/curobo/curobolib/cpp/line_search_kernel.cu index ff061a1..ab13d1a 100644 --- a/src/curobo/curobolib/cpp/line_search_kernel.cu +++ b/src/curobo/curobolib/cpp/line_search_kernel.cu @@ -9,7 +9,6 @@ * its affiliates is strictly prohibited. 
*/ -#pragma once #include #include #include diff --git a/src/curobo/curobolib/cpp/self_collision_kernel.cu b/src/curobo/curobolib/cpp/self_collision_kernel.cu index b9aaeb5..7b6d19f 100644 --- a/src/curobo/curobolib/cpp/self_collision_kernel.cu +++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu @@ -185,11 +185,11 @@ namespace Curobo if (coll_matrix[i * nspheres + j] == 1) { float4 sph1 = __rs_shared[i]; - - if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) - { - continue; - } + // + //if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) + //{ + // continue; + //} float r_diff = sph1.w + sph2.w; float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + (sph1.y - sph2.y) * (sph1.y - sph2.y) + @@ -380,10 +380,10 @@ namespace Curobo float4 sph1 = __rs_shared[NBPB * i + l]; float4 sph2 = __rs_shared[NBPB * j + l]; - if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) - { - continue; - } + //if ((sph1.w <= 0.0) || (sph2.w <= 0.0)) + //{ + // continue; + //} float r_diff = sph1.w + sph2.w; // sum of two radii, radii include respective offsets float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + diff --git a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu index ff1022f..698aa0b 100644 --- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu +++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu @@ -16,8 +16,25 @@ #include "helper_math.h" #include +#include #include +#include +#include + +// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), # x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2 +#include +#endif #define M 4 +#define VOXEL_DEBUG true +#define VOXEL_UNOBSERVED_DISTANCE -1000.0 // #define MAX_WARP_THREADS 512 // warp level batching. 
8 x M = 32 #define MAX_BOX_SHARED 256 // maximum number of boxes we can store distance and closest point @@ -45,6 +62,109 @@ namespace Curobo return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w); } + + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2 + + __device__ __forceinline__ float + get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx) + { + + __nv_fp8_storage_t value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + float value = __half2float(__nv_cvt_fp8_to_halfraw(value_in, __NV_E4M3)); + + return value; + } + + __device__ __forceinline__ void + get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx, float &value) + { + + __nv_fp8_storage_t value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + value = __half2float(__nv_cvt_fp8_to_halfraw(value_in, __NV_E4M3)); + + } + + #endif + + __device__ __forceinline__ float + get_array_value(const at::BFloat16 *grid_features, const int voxel_idx) + { + + __nv_bfloat16 value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + float value = __bfloat162float(value_in); + + return value; + } + + __device__ __forceinline__ float + get_array_value(const at::Half *grid_features, const int voxel_idx) + { + + __nv_half value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + float value = __half2float(value_in); + + return value; + } + + __device__ __forceinline__ float + get_array_value(const float *grid_features, const int voxel_idx) + { + + float value = grid_features[voxel_idx]; + + return value; + } + + __device__ __forceinline__ float + get_array_value(const double *grid_features, const int voxel_idx) + { + + float value = (float) grid_features[voxel_idx]; + + return value; + } + + + __device__ __forceinline__ void + get_array_value(const at::BFloat16 *grid_features, const int voxel_idx, float &value) + { + + __nv_bfloat16 value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + value = __bfloat162float(value_in); + + } + + __device__ __forceinline__ void + get_array_value(const at::Half *grid_features, const int voxel_idx, float &value) + { + + __nv_half value_in = (reinterpret_cast (grid_features))[voxel_idx]; + + value = __half2float(value_in); + + } + + __device__ __forceinline__ void + get_array_value(const float *grid_features, const int voxel_idx, float &value) + { + + value = grid_features[voxel_idx]; + + } + + __device__ __forceinline__ void + get_array_value(const double *grid_features, const int voxel_idx, float &value) + { + + value = (float) grid_features[voxel_idx]; + + } + template __device__ __forceinline__ void load_obb_pose(const scalar_t *obb_mat, float3& position, float4& quat) @@ -72,6 +192,7 @@ namespace Curobo bounds.z = loc_bounds.z / 2; } + template __device__ __forceinline__ void transform_sphere_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz @@ -164,9 +285,9 @@ namespace Curobo } else { - C.x = sphere_pos.x; - C.y = sphere_pos.y; - C.z = sphere_pos.z; + C.x = sphere_pos.x ; + C.y = sphere_pos.y ; + C.z = sphere_pos.z ; } } @@ -183,73 +304,6 @@ namespace Curobo C = C + temp_C; } - template - __device__ __forceinline__ void - inv_transform_vec_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz - const float4& sphere_pos, float3& C) - { - // do dot product: - // new_p = q * p * q_inv + obs_p - const scalar_t w = transform_mat[3]; - const scalar_t x = -1 * transform_mat[4]; - const scalar_t y = -1 * transform_mat[5]; - const scalar_t z = -1 * transform_mat[6]; - - if ((x != 
0) || (y != 0) || (z != 0)) - { - C.x = w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - - 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + - 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - - z * z * sphere_pos.x - y * y * sphere_pos.x; - C.y = 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + - 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - - z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - - x * x * sphere_pos.y; - C.z = 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + - z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + - 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; - } - else - { - C.x = sphere_pos.x; - C.y = sphere_pos.y; - C.z = sphere_pos.z; - } - } - - template - __device__ __forceinline__ void - inv_transform_vec_quat_add(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz - const float4& sphere_pos, float3& C) - { - // do dot product: - // new_p = q * p * q_inv + obs_p - const scalar_t w = transform_mat[3]; - const scalar_t x = -1 * transform_mat[4]; - const scalar_t y = -1 * transform_mat[5]; - const scalar_t z = -1 * transform_mat[6]; - - if ((x != 0) || (y != 0) || (z != 0)) - { - C.x += w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - - 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + - 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - - z * z * sphere_pos.x - y * y * sphere_pos.x; - C.y += 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + - 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - - z * z * sphere_pos.y + w * w * sphere_pos.y - - 2 * x * w * sphere_pos.z - x * x * sphere_pos.y; - C.z += 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + - z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - - y * y * sphere_pos.z + 2 * w * x * sphere_pos.y - - x * x * sphere_pos.z + w * w * sphere_pos.z; - } - { - C.x += sphere_pos.x; - C.y += sphere_pos.y; - C.z += sphere_pos.z; - } - } /** * @brief Scales the Collision across the trajectory by sphere velocity. 
This is @@ -277,7 +331,10 @@ namespace Curobo norm_vel_vec = (0.5 / dt) * norm_vel_vec; const float sph_vel = length(norm_vel_vec); - + if(sph_vel < 0.001) + { + return; + } if (transform_back) { float3 sph_acc_vec = make_float3( @@ -300,11 +357,11 @@ namespace Curobo vel_arr[2] = norm_vel_vec.z; // calculate projection ( I - (v * v^T)): -#pragma unroll 3 +#pragma unroll for (int i = 0; i < 3; i++) { -#pragma unroll 3 +#pragma unroll for (int j = 0; j < 3; j++) { @@ -322,7 +379,7 @@ namespace Curobo float orth_pt[3]; // orth_proj(3x3) * max_grad(3x1) float orth_curve[3]; // max_dist(1) * orth_proj (3x3) * curvature_vec (3x1) -#pragma unroll 3 +#pragma unroll for (int i = 0; i < 3; i++) // matrix - vector product { @@ -348,6 +405,120 @@ namespace Curobo // + + + __device__ __forceinline__ void + compute_closest_point(const float3& bounds, const float4& sphere, + float3& delta, float& distance, float& sph_distance) + { + float3 pt = make_float3(sphere.x, sphere.y, sphere.z); + bool inside = true; + + if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y), + fabs(sphere.z) - bounds.z) >= (0.0)) + { + inside = false; + } + + + float3 val = make_float3(sphere.x,sphere.y,sphere.z); + val = bounds - fabs(val); + + if(!inside) + { + + + if (val.x < 0) // it's outside limits, clamp: + { + pt.x = copysignf(bounds.x, sphere.x); + } + + + if (val.y < 0) // it's outside limits, clamp: + { + pt.y = copysignf(bounds.y, sphere.y); + } + + if (val.z < 0) // it's outside limits, clamp: + { + pt.z = copysignf(bounds.z, sphere.z); + } + } + else + { + + + val = fabs(val); + + + + + if (val.y <= val.x && val.y <= val.z) + { + + if(sphere.y > 0) + { + pt.y = bounds.y; + } + else + { + pt.y = -1 * bounds.y; + } + } + + else if (val.x <= val.y && val.x <= val.z) + { + if(sphere.x > 0) + { + pt.x = bounds.x; + } + else + { + pt.x = -1 * bounds.x; + } + } + else if (val.z <= val.x && val.z <= val.y) + { + + if(sphere.z > 0) + { + pt.z = bounds.z; + } + else + { + pt.z = -1 * bounds.z; + } + } + + + + + } + + delta = make_float3(pt.x - sphere.x, pt.y - sphere.y, pt.z - sphere.z); + + distance = length(delta); + + if (!inside) + { + distance *= -1.0; + } + else + { + delta = -1 * delta; + } + + if (distance != 0.0) + { + delta = normalize(delta); + + } + sph_distance = distance + sphere.w; + // + + + } + /** * @brief check if sphere is inside. For numerical stability, we assume that if * sphere is exactly at bound of cuboid, we are not in collision. 
Note: this is @@ -358,226 +529,404 @@ namespace Curobo * @return bool */ __device__ __forceinline__ bool - check_sphere_aabb(const float3 bounds, const float4 sphere) + check_sphere_aabb(const float3 bounds, const float4 sphere, bool &inside, + float3& delta, float& distance, float& sph_distance) { // if((fabs(sphere.x) - bounds.x) >= sphere.w || fabs(sphere.y) - bounds.y >= // sphere.w || (fabs(sphere.z) - bounds.z) >= sphere.w) + + inside = false; + + if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y), fabs(sphere.z) - bounds.z) >= (sphere.w)) { return false; } - return true; + // if it's within aabb, check more accurately: + // compute closest point: + + compute_closest_point(bounds, sphere, delta, distance, sph_distance); + if (sph_distance > 0) + { + inside = true; + } + + return inside; } - - __device__ __forceinline__ void - compute_closest_point(const float3& bounds, const float4& sphere, float4& pt) - { - // assuming the cuboid is centered at origin - // Find the closest face to the sphere position: - // If sphere is within bounds of obstacle, - float min_val, curr_val; - int min_idx; - - // We check the distance to each face and store the closest face - // All we want is the index of the closest face - min_val = bounds.x - fabsf(sphere.x); - - if (min_val < 0) // it's outside limits, clamp: - { - pt.x = copysignf(bounds.x, sphere.x); - } - - // check if bounds.x - sphere.x > bounds.x + sphere.x - min_val = fabsf(min_val); - - if (sphere.x > 0) - { - min_idx = 0; - } - else - { - min_idx = 1; - } - - curr_val = bounds.y - fabsf(sphere.y); // check if sphere-y is outside y-lim - - if (curr_val < 0) // outside obb-y, we clamp point to bounds - { - pt.y = copysignf(bounds.y, sphere.y); - } - curr_val = fabsf(curr_val); - - if (curr_val < min_val) - { - min_val = curr_val; - - if (sphere.y > 0) - { - min_idx = 2; - } - else - { - min_idx = 3; - } - } - curr_val = bounds.z - fabsf(sphere.z); // distance to -ve bound - - if (curr_val < 0) - { - pt.y = copysignf(bounds.z, sphere.z); - } - curr_val = fabsf(curr_val); - - if (curr_val < min_val) - { - min_val = curr_val; - - if (sphere.z > 0) - { - min_idx = 4; - } - else - { - min_idx = 5; - } - } - - // we move pt's value to bounds on the closest face dimension: - switch (min_idx) - { - case 0: - pt.x = bounds.x; - break; - - case 1: - pt.x = -1 * bounds.x; - break; - - case 2: - pt.y = bounds.y; - break; - - case 3: - pt.y = -1 * bounds.y; - break; - - case 4: - pt.z = bounds.z; - break; - - case 5: - pt.z = -1 * bounds.z; - break; - - default: - break; - } - } - __device__ __forceinline__ float - compute_distance(const float3& bounds_w_radius, - const float4& sphere) // pass in cl_pt - {// compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0); - - compute_closest_point(bounds_w_radius, sphere, cl_pt); - - // clamp: - + compute_distance_fn( + const float3& bounds, + const float4& sphere, + const float max_distance, + float3& delta, + float& sph_dist, + float& distance, + bool& inside) // pass in cl_pt + { + // compute distance: - return sphere_length(sphere, cl_pt); // cl_pt includes radius, and sphere also has radius + float4 loc_sphere = sphere; + loc_sphere.w = max_distance; + distance = max_distance; + check_sphere_aabb(bounds, loc_sphere, inside, delta, distance, sph_dist); + + //distance = fabsf(distance); + return distance; } + +template +__device__ __forceinline__ void scale_eta_metric_vector( +const float eta, +float4 &sum_pt) +{ + float sph_dist = sum_pt.w; + + if (sph_dist == 0) + 
{ + sum_pt.x = 0; + sum_pt.y = 0; + sum_pt.z = 0; + sum_pt.w = 0; + + return; + } + sum_pt.w = sph_dist - eta; + //sum_pt.x = sum_pt.x / sph_dist; + //sum_pt.y = sum_pt.y / sph_dist; + //sum_pt.z = sum_pt.z / sph_dist; + + if (SCALE_METRIC) + { + if (eta > 0.0 && sph_dist > 0) + { + //sum_pt.x = sum_pt.x * (1/sph_dist); + //sum_pt.y = sum_pt.y * (1/sph_dist); + //sum_pt.z = sum_pt.z * (1/sph_dist); + + if (sph_dist> eta) + { + sum_pt.w = sph_dist - 0.5 * eta; + + + } else if (sph_dist <= eta) + { + + sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); + const float scale = (1 / eta) * (sph_dist); + sum_pt.x = scale * sum_pt.x; + sum_pt.y = scale * sum_pt.y; + sum_pt.z = scale * sum_pt.z; + } + + } + + } +} + + template + __device__ __forceinline__ void scale_eta_metric( + const float3 delta, + const float sph_dist, + const float eta, + float4& sum_pt) + { + // compute distance: + //float sph_dist = 0; + + sum_pt.x = delta.x; + sum_pt.y = delta.y; + sum_pt.z = delta.z; + sum_pt.w = sph_dist; + + + if(SCALE_METRIC) + { + + if (sph_dist > 0) + { + + if (sph_dist > eta) + { + sum_pt.w = sph_dist - 0.5 * eta; + } + else if (sph_dist <= eta) + { + sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); + const float scale = (1.0 / eta) * (sph_dist); + sum_pt.x = scale * sum_pt.x; + sum_pt.y = scale * sum_pt.y; + sum_pt.z = scale * sum_pt.z; + } + + } + else + { + sum_pt.x = 0.0; + sum_pt.y = 0.0; + sum_pt.z = 0.0; + sum_pt.w = 0.0; + + } + + } + + + + + + } + + + template __device__ __forceinline__ void scale_eta_metric(const float4& sphere, const float4& cl_pt, const float eta, + const bool inside, float4& sum_pt) { // compute distance: - float sph_dist = 0; + float distance = 0; - sph_dist = sphere_length(sphere, cl_pt); + scale_eta_metric(sphere, cl_pt, eta, inside, sum_pt, distance); + + + } - if (sph_dist == 0) + + template + __device__ __forceinline__ void + compute_voxel_location_params( + const grid_scalar_t *grid_features, + const float4& loc_grid_params, + const float4& loc_sphere, + int3 &xyz_loc, + int3 &xyz_grid, + float &interpolated_distance, + int &voxel_idx) + { + + + // convert location to index: can use floor to cast to int. + // to account for negative values, add 0.5 * bounds. 
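    // Worked example (grid values assumed for illustration only): for grid
    // dims (loc_grid) of 2.0 x 2.0 x 1.0 m and a voxel size
    // (loc_grid_params.w) of 0.05 m, xyz_grid = (40, 40, 20). A sphere center
    // at the grid origin maps to xyz_loc = (0, 0, 0) + (20, 20, 10) =
    // (20, 20, 10), and the flat row-major index computed below is
    //   voxel_idx = 20 * 40 * 20 + 20 * 20 + 10 = 16410.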
+ const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z); + const float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z); + // xyz_loc = make_int3(floorf((sphere + 0.5 * loc_grid) / loc_grid_params.w)); + xyz_loc = make_int3(floorf((sphere) / loc_grid_params.w) + (0.5 * loc_grid/loc_grid_params.w)); + xyz_grid = make_int3(floorf(loc_grid / loc_grid_params.w)); + + // find next nearest voxel to current point and then do weighted interpolation: + voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z; + + + // compute interpolation distance between voxel origin and sphere location: + get_array_value(grid_features, voxel_idx, interpolated_distance);// + 0.5 * loc_grid_params.w; + if(!INTERPOLATION) { - sum_pt.x = 0; - sum_pt.y = 0; - sum_pt.z = 0; - sum_pt.w = 0; + interpolated_distance += 0.5 * loc_grid_params.w;//max(0.0, (0.3 * loc_grid_params.w) - loc_sphere.w); + } + if(INTERPOLATION) + { + // + float3 voxel_origin = (make_float3(xyz_loc) * loc_grid_params.w) - (loc_grid/2); + + + float3 delta = sphere - voxel_origin; + int3 next_loc = make_int3(floorf((make_float3(xyz_loc) + normalize(delta)))); + float ratio = length(delta)/loc_grid_params.w; + int next_voxel_idx = next_loc.x * xyz_grid.y * xyz_grid.z + next_loc.y * xyz_grid.z + next_loc.z; + + interpolated_distance = ratio * interpolated_distance + (1 - ratio) * get_array_value(grid_features, next_voxel_idx) + + max(0.0, (ratio * loc_grid_params.w) - loc_sphere.w);; + + } + + + + + } + + template + __device__ __forceinline__ void + compute_voxel_index( + const grid_scalar_t *grid_features, + const float4& loc_grid_params, + const float4& loc_sphere, + int &voxel_idx, + int3 &xyz_loc, + int3 &xyz_grid, + float &interpolated_distance) + { + // check if sphere is out of bounds + // loc_grid_params.x contains bounds + float4 local_bounds = 0.5*loc_grid_params - 2*loc_grid_params.w; + + if (loc_sphere.x <= -1 * (local_bounds.x) || + loc_sphere.x >= (local_bounds.x) || + loc_sphere.y <= -1 * (local_bounds.y) || + loc_sphere.y >= (local_bounds.y) || + loc_sphere.z <= -1 * (local_bounds.z) || + loc_sphere.z >= (local_bounds.z)) + { + voxel_idx = -1; return; } - sum_pt.x = (sphere.x - cl_pt.x) / sph_dist; - sum_pt.y = (sphere.y - cl_pt.y) / sph_dist; - sum_pt.z = (sphere.z - cl_pt.z) / sph_dist; + + compute_voxel_location_params(grid_features, loc_grid_params, loc_sphere, xyz_loc, xyz_grid, interpolated_distance, voxel_idx); + // convert location to index: can use floor to cast to int. + // to account for negative values, add 0.5 * bounds. 
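    // Out-of-bounds convention used above: local_bounds shrinks the grid by
    // two voxels on each side (0.5 * dims - 2 * voxel_size), so with the
    // example values of 2.0 m dims and 0.05 m voxels only queries with
    // |coordinate| < 0.9 m are looked up. Anything outside returns
    // voxel_idx = -1, which callers such as compute_sphere_voxel_gradient
    // report as VOXEL_UNOBSERVED_DISTANCE (-1000.0).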
+ + + - if (eta > 0.0) - { - // sph_dist = sph_dist - eta; - if (sph_dist > eta) - { - sum_pt.w = sph_dist - 0.5 * eta; - } - else if (sph_dist <= eta) - { - sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); - const float scale = (1 / eta) * (sph_dist); - sum_pt.x = scale * sum_pt.x; - sum_pt.y = scale * sum_pt.y; - sum_pt.z = scale * sum_pt.z; - } - } - else + } + + + + + + template + __device__ __forceinline__ void + compute_voxel_fd_gradient( + const grid_scalar_t *grid_features, + const int voxel_layer_start_idx, + const int3& xyz_loc, + const int3& xyz_grid, + const float voxel_size, + float4 &cl_pt) + { + + float3 d_grad; + if (CENTRAL_DIFFERENCE) + { + + // x difference: + int x_plus, x_minus, y_plus, y_minus, z_plus, z_minus; + + x_plus = (xyz_loc.x + 1) * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z; + x_minus = (xyz_loc.x - 1)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z; + + y_plus = (xyz_loc.x) * xyz_grid.y * xyz_grid.z + (xyz_loc.y + 1) * xyz_grid.z + xyz_loc.z; + y_minus = (xyz_loc.x )* xyz_grid.y * xyz_grid.z + (xyz_loc.y -1) * xyz_grid.z + xyz_loc.z; + + z_plus = (xyz_loc.x) * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z + 1; + z_minus = (xyz_loc.x)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z - 1; + + + float3 d_plus = make_float3( + get_array_value(grid_features,voxel_layer_start_idx + x_plus), + get_array_value(grid_features, voxel_layer_start_idx + y_plus), + get_array_value(grid_features,voxel_layer_start_idx + z_plus)); + float3 d_minus = make_float3( + get_array_value(grid_features,voxel_layer_start_idx + x_minus), + get_array_value(grid_features, voxel_layer_start_idx + y_minus), + get_array_value(grid_features,voxel_layer_start_idx + z_minus)); + + + d_grad = (d_plus - d_minus) * (1/(2*voxel_size)); + } + else + { + // x difference: + int x_minus,y_minus, z_minus; + + x_minus = (xyz_loc.x - 1)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z; + y_minus = (xyz_loc.x )* xyz_grid.y * xyz_grid.z + (xyz_loc.y -1) * xyz_grid.z + xyz_loc.z; + z_minus = (xyz_loc.x)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z - 1; + + + float3 d_plus = make_float3(cl_pt.w, cl_pt.w, cl_pt.w); + float3 d_minus = make_float3( + get_array_value(grid_features,voxel_layer_start_idx + x_minus), + get_array_value(grid_features, voxel_layer_start_idx + y_minus), + get_array_value(grid_features,voxel_layer_start_idx + z_minus)); + + + d_grad = (d_plus - d_minus) * (1/voxel_size); + } + + + if (NORMALIZE) + { + if (!(d_grad.x ==0 && d_grad.y == 0 && d_grad.z == 0)) { - sum_pt.w = sph_dist; + d_grad = normalize(d_grad); } } - - /** - * @brief Compute distance and gradient, with a smooth l2 loss when distance >0 - * and < eta - * - * @param bounds_w_radius - * @param sphere - * @param sum_pt - * @param eta - * @return __device__ - */ - __device__ __forceinline__ void - compute_sphere_gradient(const float3& bounds_w_radius, const float4& sphere, - float4& sum_pt, const float eta = 0.0) + cl_pt.x = d_grad.x; + cl_pt.y = d_grad.y; + cl_pt.z = d_grad.z; + if (ADD_NOISE) { - // compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0); - - compute_closest_point(bounds_w_radius, sphere, cl_pt); - - scale_eta_metric(sphere, cl_pt, eta, sum_pt); + if (cl_pt.z == 0 && cl_pt.x == 0 && cl_pt.y == 0) + { + cl_pt.x = 0.001; + cl_pt.y = 0.001; + } + } + } + template __device__ __forceinline__ void - compute_sphere_gradient_add(const float3& bounds_w_radius, const float4& sphere, - float4& 
sum_pt, const float k0 = 1.0, - const float eta = 0.0) + compute_sphere_voxel_gradient(const grid_scalar_t *grid_features, + const int voxel_layer_start_idx, + const int num_voxels, + const float4& loc_grid_params, + const float4& loc_sphere, + float4 &sum_pt, + float &signed_distance, + const float eta = 0.0, + const float max_distance = -10.0, + const bool transform_back = true) { - // compute closest point: - float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0); + int voxel_idx = 0; + int3 xyz_loc = make_int3(0,0,0); + int3 xyz_grid = make_int3(0,0,0); + float interpolated_distance = 0.0; + compute_voxel_index(grid_features, loc_grid_params, loc_sphere, voxel_idx, xyz_loc, xyz_grid, interpolated_distance); + if (voxel_idx < 0 || voxel_idx >= num_voxels) + { + sum_pt.w = VOXEL_UNOBSERVED_DISTANCE; + signed_distance = VOXEL_UNOBSERVED_DISTANCE; + return; + } + + + //sum_pt.w = get_array_value(grid_features,voxel_layer_start_idx + voxel_idx); + sum_pt.w = interpolated_distance; + + if ((!SCALE_METRIC && transform_back)|| (transform_back && sum_pt.w > -loc_sphere.w )) + { + // compute closest point: + compute_voxel_fd_gradient(grid_features, voxel_layer_start_idx, xyz_loc, xyz_grid, loc_grid_params.w, sum_pt); + } - compute_closest_point(bounds_w_radius, sphere, cl_pt); + signed_distance = sum_pt.w; + + sum_pt.w += loc_sphere.w; + + + scale_eta_metric_vector(eta, sum_pt); + - // We need to check if sphere is colliding or outside - // if distance > 0.0, then it's colliding - // We can achieve this by adding eta to bounds_w_radius and then subtracting - // it - float4 local_sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); - scale_eta_metric(sphere, cl_pt, eta, local_sum_pt); - sum_pt += local_sum_pt; } + + + + __device__ __forceinline__ void check_jump_distance( const float4& loc_sphere_1, const float4 loc_sphere_0, const float k0, - const float eta, const float3& loc_bounds, const float3& grad_loc_bounds, + const float3& bounds, + const float max_distance, + float3& delta, + float& sph_dist, + float& distance, + bool& inside, + const float eta, float4& sum_pt, float& curr_jump_distance) // we can pass in interpolated sphere here, also // need to pass cl_pt for use in @@ -586,17 +935,27 @@ namespace Curobo const float4 interpolated_sphere = (k0) * loc_sphere_1 + (1 - k0) * loc_sphere_0; - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + if (check_sphere_aabb(bounds, interpolated_sphere, inside, delta, distance, sph_dist)) { - compute_sphere_gradient_add(grad_loc_bounds, interpolated_sphere, sum_pt, - k0, eta); // true, loc_sphere_1 works better - curr_jump_distance += loc_sphere_1.w; // move by diameter of sphere + float4 loc_grad = make_float4(0,0,0,0); + scale_eta_metric(delta, sph_dist, eta, loc_grad); + sum_pt += loc_grad; + } else { - const float dist = compute_distance(grad_loc_bounds, interpolated_sphere); - curr_jump_distance += max(dist - 2 * loc_sphere_1.w, loc_sphere_1.w); + compute_distance_fn( + bounds, + interpolated_sphere, + max_distance, + delta, + sph_dist, + distance, + inside + ); } + curr_jump_distance += max(fabsf(distance), interpolated_sphere.w); + } /////////////////////////////////////////////////////////////// @@ -608,15 +967,15 @@ namespace Curobo // use of shared memory. 
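  // Reference for the activation-distance smoothing used by the kernels below
  // (this restates scale_eta_metric above in scalar form; it is not a separate
  // implementation). For the signed value d returned by check_sphere_aabb
  // (positive when the sphere overlaps the cuboid) and activation distance
  // eta > 0:
  //   cost(d) = d - 0.5 * eta        if d > eta
  //   cost(d) = d * d / (2 * eta)    if 0 < d <= eta
  //   cost(d) = 0                    if d <= 0
  // Both the cost and its slope are continuous at d == eta (cost = 0.5 * eta,
  // slope = 1), and in the quadratic region the gradient direction is scaled
  // by d / eta before being written out.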
/////////////////////////////////////////////////////////////// - template + template __device__ __forceinline__ void sphere_obb_collision_fn( const scalar_t *sphere_position, const int env_idx, const int bn_sph_idx, const int sph_idx, - scalar_t *out_distance, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *obb_accel, - const scalar_t *obb_bounds, const scalar_t *obb_mat, + dist_scalar_t *out_distance, const float *weight, + const float *activation_distance, const float *obb_accel, + const float *obb_bounds, const float *obb_mat, const uint8_t *obb_enable, const int max_nobs, const int nboxes) { float max_dist = 0; @@ -639,7 +998,11 @@ namespace Curobo float4 obb_quat = make_float4(0.0); float3 obb_pos = make_float3(0.0); float3 loc_bounds = make_float3(0.0); - + bool inside = false; + float distance = 0.0; + float sph_dist = 0.0; + float3 delta = make_float3(0,0,0); + for (int box_idx = 0; box_idx < nboxes; box_idx++) { if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle @@ -651,32 +1014,36 @@ namespace Curobo load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); - + // first check if point is inside or outside box: - if (check_sphere_aabb(loc_bounds, loc_sphere)) + if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist)) { - max_dist = 1; - break; // we exit without checking other cuboids if we found a collision. + // using same primitive functions: + max_dist = 1; + break; // we exit without checking other cuboids if we found a collision. } + } + out_distance[bn_sph_idx] = weight[0] * max_dist; } - template + template __device__ __forceinline__ void sphere_obb_distance_fn( const scalar_t *sphere_position, const int32_t env_idx, const int bn_sph_idx, const int sph_idx, - scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, + dist_scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, + const float *weight, const float *activation_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_mat, const uint8_t *obb_enable, const int max_nobs, const int nboxes, const bool transform_back) { float max_dist = 0.0; + const float eta = activation_distance[0]; float3 max_grad = make_float3(0.0, 0.0, 0.0); @@ -703,7 +1070,11 @@ namespace Curobo float4 obb_quat = make_float4(0.0); float3 obb_pos = make_float3(0.0); float3 loc_bounds = make_float3(0.0); - + float4 loc_grad = make_float4(0,0,0,0); + bool inside = false; + float distance = 0.0; + float sph_dist = 0.0; + float3 delta = make_float3(0,0,0); for (int box_idx = 0; box_idx < nboxes; box_idx++) { if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle @@ -716,26 +1087,45 @@ namespace Curobo load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); - + // first check if point is inside or outside box: - if (check_sphere_aabb(loc_bounds, loc_sphere)) + if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist)) { // compute closest point: - loc_bounds = loc_bounds + loc_sphere.w; + //loc_bounds = loc_bounds + loc_sphere.w; // using same primitive functions: - float4 cl; - compute_sphere_gradient(loc_bounds, loc_sphere, cl, eta); + 
scale_eta_metric(delta, sph_dist, eta, loc_grad); - max_dist += cl.w; - - if (transform_back) + + if (SUM_COLLISIONS) { - inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad); + if (loc_grad.w > 0) + { + max_dist += loc_grad.w; - // inv_transform_vec_quat_add(&obb_mat[(start_box_idx + box_idx) * 7], cl, - // max_grad); + if (transform_back) + { + + inv_transform_vec_quat_add(obb_pos, obb_quat, loc_grad, max_grad); + } + } } + else + { + if (loc_grad.w > max_dist) + { + max_dist = loc_grad.w; + + if (transform_back) + { + inv_transform_vec_quat(obb_pos, obb_quat, loc_grad, max_grad); + + } + } + + } + } } @@ -767,18 +1157,824 @@ namespace Curobo sparsity_idx[bn_sph_idx] = 1; } - template + + template + __device__ __forceinline__ void sphere_obb_esdf_fn( + const scalar_t *sphere_position, + const int32_t env_idx, + const int bn_sph_idx, + const int sph_idx, + dist_scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, + const float *weight, const float *activation_distance, + const float *max_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_mat, const uint8_t *obb_enable, const int max_nobs, + const int nboxes, const bool transform_back) + { + + const float eta = max_distance[0]; + float max_dist = -1 * eta; + + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + if (sphere_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bn_sph_idx] != 0) + { + sparsity_idx[bn_sph_idx] = 0; + *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0); + } + return; + } + sphere_cache.w += eta; + + //const float sphere_radius = sphere_cache.w + eta; + + const int start_box_idx = max_nobs * env_idx; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + bool inside = false; + float distance = 0.0; + float sph_dist = 0.0; + float3 delta = make_float3(0,0,0); + float4 loc_grad = make_float4(0,0,0,0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) + { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + + load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + + // first check if point is inside or outside box: + if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist)) + { + // compute closest point: + + + // using same primitive functions: + scale_eta_metric(delta, sph_dist, eta, loc_grad); + + + if (loc_grad.w > max_dist) + { + max_dist = loc_grad.w; + + if (transform_back) + { + inv_transform_vec_quat(obb_pos, obb_quat, loc_grad, max_grad); + + } + } + } + } + // subtract radius: + max_dist = max_dist - sphere_cache.w; + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; + } + out_distance[bn_sph_idx] = max_dist; + } + + template + __device__ __forceinline__ void sphere_voxel_distance_fn( + const geom_scalar_t *sphere_position, + const int32_t env_idx, + const int bn_sph_idx, + const int sph_idx, + dist_scalar_t *out_distance, grad_scalar_t *closest_pt, uint8_t *sparsity_idx, + const float *weight, + const float *activation_distance, + const float *max_distance, + const grid_scalar_t *grid_features, + const 
float *grid_params, + const float *obb_mat, + const uint8_t *obb_enable, + const int max_nobs, + const int num_voxels, + const bool transform_back) + { + float max_dist = 0.0; + float max_distance_local = max_distance[0]; + max_distance_local = -1 * max_distance_local; + const float eta = activation_distance[0]; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + + if (sphere_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bn_sph_idx] != 0) + { + sparsity_idx[bn_sph_idx] = 0; + *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0); + } + return; + } + sphere_cache.w += eta; + const int local_env_idx = max_nobs * env_idx; + float signed_distance = 0; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float4 loc_grid_params = make_float4(0.0); + + if (NUM_LAYERS <= 4) + { + + #pragma unroll + for (int layer_idx=0; layer_idx < NUM_LAYERS; layer_idx++) + { + + + int local_env_layer_idx = local_env_idx + layer_idx; + if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle + { + + load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos, + obb_quat); + loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4]; + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + int voxel_layer_start_idx = local_env_layer_idx * num_voxels; + // check distance: + float4 cl; + compute_sphere_voxel_gradient(grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, loc_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + max_dist += cl.w; + if (transform_back) + { + inv_transform_vec_quat_add(obb_pos, obb_quat, cl, max_grad); + + } + } + } + } + } + else + { + + + + + for (int layer_idx=0; layer_idx < max_nobs; layer_idx++) + { + + + int local_env_layer_idx = local_env_idx + layer_idx; + if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle + { + + load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos, + obb_quat); + loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4]; + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + int voxel_layer_start_idx = local_env_layer_idx * num_voxels; + // check distance: + float4 cl; + compute_sphere_voxel_gradient(grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, loc_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + max_dist += cl.w; + if (transform_back) + { + inv_transform_vec_quat_add(obb_pos, obb_quat, cl, max_grad); + + } + } + } + } + } + // sparsity opt: + if (max_dist == 0) + { + if (sparsity_idx[bn_sph_idx] == 0) + { + return; + } + sparsity_idx[bn_sph_idx] = 0; + + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros + } + out_distance[bn_sph_idx] = 0.0; + return; + } + + // else max_dist != 0 + max_dist = weight[0] * max_dist; + + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad; + } + out_distance[bn_sph_idx] = max_dist; + sparsity_idx[bn_sph_idx] = 1; + } + + + template + __device__ __forceinline__ void swept_sphere_voxel_distance_fn( + const scalar_t *sphere_position, + const int env_idx, + const int b_idx, + const int h_idx, + const int sph_idx, + dist_scalar_t *out_distance, + scalar_t *closest_pt, + uint8_t 
*sparsity_idx, + const float *weight, + const float *activation_distance, + const float *max_distance, + const float *speed_dt, + const grid_scalar_t *grid_features, + const float *grid_params, + const float *grid_pose, + const uint8_t *grid_enable, + const int max_nobs, + const int env_ngrid, + const int num_voxels, + const int batch_size, + const int horizon, + const int nspheres, + const int sweep_steps, + const bool transform_back) + { + const int sw_steps = sweep_steps; + const int start_box_idx = max_nobs * env_idx; + const int b_addrs = + b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; + + // We read the same obstacles across + + // Load sphere_position input + // if h_idx == horizon -1, we just read the same index + const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; + + + + + + float max_dist = 0.0; + float max_distance_local = max_distance[0]; + max_distance_local = -1 * max_distance_local; + const float eta = activation_distance[0]; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4]; + + if (sphere_1_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bhs_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bhs_idx] != 0) + { + sparsity_idx[bhs_idx] = 0; + *(float4 *)&closest_pt[bhs_idx * 4] = make_float4(0.0); + } + return; + } + sphere_1_cache.w += eta; + float4 loc_sphere_0, loc_sphere_2; + + bool sweep_back = false; + bool sweep_fwd = false; + float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len; + + + const float dt = speed_dt[0]; + float4 sphere_0_cache = make_float4(0,0,0,0); + float4 sphere_2_cache = make_float4(0,0,0,0); + + if (h_idx > 0) + { + sphere_0_cache = + *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4]; + sphere_0_cache.w = sphere_1_cache.w; + sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache); + sphere_0_len = sphere_0_distance + sphere_0_cache.w * 2; + + if (sphere_0_distance > 0.0) + { + sweep_back = true; + } + } + + if (h_idx < horizon - 1) + { + sphere_2_cache = + *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4]; + sphere_2_cache.w = sphere_1_cache.w; + sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache); + sphere_2_len = sphere_2_distance + sphere_2_cache.w * 2; + + if (sphere_2_distance > 0.0) + { + sweep_fwd = true; + } + } + + float signed_distance = 0.0; + const int local_env_idx = max_nobs * env_idx; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float4 loc_grid_params = make_float4(0.0); + float4 sum_grad = make_float4(0.0, 0.0, 0.0, 0.0); + float4 cl; + float jump_mid_distance = 0.0; + float k0, k1; + float temp_jump_distance = 0.0; + + if (NUM_LAYERS <= 4) + { + + + #pragma unroll + for (int layer_idx=0; layer_idx < NUM_LAYERS; layer_idx++) + { + float curr_jump_distance = 0.0; + + int local_env_layer_idx = local_env_idx + layer_idx; + sum_grad *= 0.0; + if (grid_enable[local_env_layer_idx] != 0) // disabled obstacle + { + + load_obb_pose(&grid_pose[(local_env_layer_idx) * 8], obb_pos, + obb_quat); + loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4]; + + transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere); + transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); + transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); + + int 
voxel_layer_start_idx = local_env_layer_idx * num_voxels; + // check distance: + compute_sphere_voxel_gradient(grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, loc_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + jump_mid_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + jump_mid_distance = -1 * signed_distance; + } + + + jump_mid_distance = max(jump_mid_distance, loc_sphere.w); + curr_jump_distance = jump_mid_distance; + if (sweep_back && curr_jump_distance < sphere_0_distance/2) + { + for (int j=0; j= sphere_0_len/2) + { + break; + } + temp_jump_distance = 0.0; + k0 = 1 - (curr_jump_distance/sphere_0_len); + // compute collision + const float4 interpolated_sphere = + (k0)*loc_sphere + (1 - k0) * loc_sphere_0; + + compute_sphere_voxel_gradient( + grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, interpolated_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + temp_jump_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + temp_jump_distance = -1 * signed_distance; + } + temp_jump_distance = max(temp_jump_distance, loc_sphere.w); + curr_jump_distance += temp_jump_distance; + + + } + } + curr_jump_distance = jump_mid_distance; + if (sweep_fwd && curr_jump_distance < sphere_2_distance/2) + { + for (int j=0; j= sphere_2_len/2) + { + break; + } + temp_jump_distance = 0.0; + k0 = 1 - (curr_jump_distance/sphere_2_len); + // compute collision + const float4 interpolated_sphere = + (k0)*loc_sphere + (1 - k0) * loc_sphere_2; + + compute_sphere_voxel_gradient( + grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, interpolated_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + temp_jump_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + temp_jump_distance = -1 * signed_distance; + } + temp_jump_distance = max(temp_jump_distance, loc_sphere.w); + curr_jump_distance += temp_jump_distance; + + } + } + if (sum_grad.w > 0.0 ) + { + max_dist += sum_grad.w; + if (transform_back) + { + inv_transform_vec_quat_add(obb_pos, obb_quat, sum_grad, max_grad); + } + + } + + + + + } + } + } + else + { + + + + for (int layer_idx=0; layer_idx < max_nobs; layer_idx++) + { + float curr_jump_distance = 0.0; + + int local_env_layer_idx = local_env_idx + layer_idx; + sum_grad *= 0.0; + if (grid_enable[local_env_layer_idx] != 0) // disabled obstacle + { + + load_obb_pose(&grid_pose[(local_env_layer_idx) * 8], obb_pos, + obb_quat); + loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4]; + + transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere); + transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); + transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); + + int voxel_layer_start_idx = local_env_layer_idx * num_voxels; + // check distance: + compute_sphere_voxel_gradient(grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, loc_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + jump_mid_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + jump_mid_distance = -1 * signed_distance; + } + + + jump_mid_distance = max(jump_mid_distance, loc_sphere.w); + curr_jump_distance = 
jump_mid_distance; + if (sweep_back && curr_jump_distance < sphere_0_distance/2) + { + for (int j=0; j= sphere_0_len/2) + { + break; + } + temp_jump_distance = 0.0; + k0 = 1 - (curr_jump_distance/sphere_0_len); + // compute collision + const float4 interpolated_sphere = + (k0)*loc_sphere + (1 - k0) * loc_sphere_0; + + compute_sphere_voxel_gradient( + grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, interpolated_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + temp_jump_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + temp_jump_distance = -1 * signed_distance; + } + temp_jump_distance = max(temp_jump_distance, loc_sphere.w); + curr_jump_distance += temp_jump_distance; + + + } + } + curr_jump_distance = jump_mid_distance; + if (sweep_fwd && curr_jump_distance < sphere_2_distance/2) + { + for (int j=0; j= sphere_2_len/2) + { + break; + } + temp_jump_distance = 0.0; + k0 = 1 - (curr_jump_distance/sphere_2_len); + // compute collision + const float4 interpolated_sphere = + (k0)*loc_sphere + (1 - k0) * loc_sphere_2; + + compute_sphere_voxel_gradient( + grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, interpolated_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>0.0) + { + sum_grad += cl; + temp_jump_distance = signed_distance; + } + else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE) + { + temp_jump_distance = -1 * signed_distance; + } + temp_jump_distance = max(temp_jump_distance, loc_sphere.w); + curr_jump_distance += temp_jump_distance; + + } + } + if (sum_grad.w > 0.0 ) + { + max_dist += sum_grad.w; + if (transform_back) + { + inv_transform_vec_quat_add(obb_pos, obb_quat, sum_grad, max_grad); + } + + } + + + + + } + } + } + // sparsity opt: + if (max_dist == 0) + { + if (sparsity_idx[bhs_idx] == 0) + { + return; + } + sparsity_idx[bhs_idx] = 0; + + if (transform_back) + { + *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros + } + out_distance[bhs_idx] = 0.0; + return; + } + + // computer speed metric here: + if (ENABLE_SPEED_METRIC) + { + if (sweep_back && sweep_fwd) + { + scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt, + transform_back, max_dist, max_grad); + } + } + // else max_dist != 0 + max_dist = weight[0] * max_dist; + + if (transform_back) + { + *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad; + } + out_distance[bhs_idx] = max_dist; + sparsity_idx[bhs_idx] = 1; + + } + + + template + __global__ void swept_sphere_voxel_distance_jump_kernel( + const scalar_t *sphere_position, + dist_scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight, + const float *activation_distance, + const float *max_distance, + const float *speed_dt, + const grid_scalar_t *grid_features, const float *grid_params, + const float *grid_pose, const uint8_t *grid_enable, + const int32_t *n_env_grid, const int32_t *env_query_idx, const int max_nobs, + const int num_voxels, + const int batch_size, const int horizon, const int nspheres, + const int sweep_steps, + const bool transform_back) + { + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + + if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) + { + return; 
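The distance functions above now pick their reduction at compile time: with `SUM_COLLISIONS` every activated obstacle adds its scaled penetration (and, via `inv_transform_vec_quat_add`, its gradient), otherwise only the deepest obstacle is kept, and the new `*_esdf_fn` variants report the largest value minus the sphere radius instead of a weighted cost. A minimal PyTorch sketch of these three reductions; the function and argument names here are illustrative and not the kernel's API:

```python
import torch

def reduce_sphere_costs(
    per_obstacle_value: torch.Tensor,  # (..., n_obstacles) activation-scaled value per obstacle
    sphere_radius: torch.Tensor,       # (...,) sphere radii
    mode: str = "sum",                 # "sum" ~ SUM_COLLISIONS=true, "max" ~ false, "esdf" ~ COMPUTE_ESDF
) -> torch.Tensor:
    if mode == "sum":
        # Every colliding obstacle contributes its cost (and, in the kernel, its gradient).
        return per_obstacle_value.clamp_min(0.0).sum(dim=-1)
    if mode == "max":
        # Only the deepest collision drives the cost and gradient.
        return per_obstacle_value.clamp_min(0.0).max(dim=-1).values
    # "esdf": a signed value for the sphere surface; positive means penetration.
    return per_obstacle_value.max(dim=-1).values - sphere_radius
```

Summing keeps a gradient contribution from every overlapping obstacle instead of only the nearest one, which is presumably part of the convergence improvement this patch targets.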
+ } + + int env_idx = 0; + + if (BATCH_ENV_T) + { + env_idx = env_query_idx[b_idx]; + } + + const int env_nboxes = n_env_grid[env_idx]; + swept_sphere_voxel_distance_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, + out_distance, closest_pt, + sparsity_idx, weight, activation_distance, max_distance, + speed_dt, + grid_features, + grid_params, grid_pose, grid_enable, max_nobs, env_nboxes, num_voxels, batch_size, + horizon, nspheres, sweep_steps, transform_back); + } + + template + __device__ __forceinline__ void sphere_voxel_esdf_fn( + const geom_scalar_t *sphere_position, + const int32_t env_idx, + const int bn_sph_idx, + const int sph_idx, + dist_scalar_t *out_distance, grad_scalar_t *closest_pt, uint8_t *sparsity_idx, + const float *weight, + const float *activation_distance, + const float *max_distance, + const grid_scalar_t *grid_features, + const float *grid_params, + const float *obb_mat, + const uint8_t *obb_enable, + const int max_nobs, + const int num_voxels, + const bool transform_back) + { + float max_distance_local = max_distance[0]; + const float eta = max_distance_local; + float max_dist = -1 * max_distance_local; + max_distance_local = -1 * max_distance_local; +; + + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + + if (sphere_cache.w <= 0.0) + { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bn_sph_idx] != 0) + { + sparsity_idx[bn_sph_idx] = 0; + *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0); + } + return; + } + const float sphere_radius = sphere_cache.w; + + sphere_cache.w += eta; + const int local_env_idx = max_nobs * env_idx; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float4 loc_grid_params = make_float4(0.0); + + float signed_distance = 0; + + for (int layer_idx=0; layer_idx < max_nobs; layer_idx++) + { + + + int local_env_layer_idx = local_env_idx + layer_idx; + if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle + { + + load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos, + obb_quat); + loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4]; + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + int voxel_layer_start_idx = local_env_layer_idx * num_voxels; + // check distance: + float4 cl; + + compute_sphere_voxel_gradient(grid_features, + voxel_layer_start_idx, num_voxels, + loc_grid_params, loc_sphere, cl, signed_distance, eta, + max_distance_local, transform_back); + if (cl.w>max_dist) + { + max_dist = cl.w; + if (transform_back) + { + inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad); + + } + } + } + } + + + + + max_dist = max_dist - sphere_radius; + if (transform_back) + { + *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; + } + out_distance[bn_sph_idx] = max_dist; + + } + + + template __device__ __forceinline__ void swept_sphere_obb_distance_fn( const scalar_t *sphere_position, const int env_idx, const int b_idx, const int h_idx, const int sph_idx, - scalar_t *out_distance, + dist_scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, - const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, + const float *weight, + const float *activation_distance, const float *speed_dt, + const float *obb_accel, const float 
*obb_bounds, + const float *obb_mat, const uint8_t *obb_enable, const int max_nobs, const int nboxes, const int batch_size, const int horizon, @@ -789,9 +1985,13 @@ namespace Curobo // warp wide reductions should only happen across nspheres in same batch and horizon // // extern __shared__ float psum[]; + int sw_steps = SWEEP_STEPS; + const float max_distance = 1000.0; - const int sw_steps = sweep_steps; - const float fl_sw_steps = sw_steps; + if (SWEEP_STEPS == -1) + { + sw_steps = sweep_steps; + } const int start_box_idx = max_nobs * env_idx; const int b_addrs = b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; @@ -829,8 +2029,13 @@ namespace Curobo const float dt = speed_dt[0]; const float eta = activation_distance[0]; sphere_1_cache.w += eta; - float4 sphere_0_cache, sphere_2_cache; - + float4 sphere_0_cache = make_float4(0,0,0,0); + float4 sphere_2_cache = make_float4(0,0,0,0); + float4 loc_grad = make_float4(0,0,0,0); + bool inside = false; + float distance = 0.0; + float sph_dist = 0.0; + float3 delta = make_float3(0,0,0); if (h_idx > 0) { sphere_0_cache = @@ -880,39 +2085,45 @@ namespace Curobo load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds); float curr_jump_distance = 0.0; - const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius + //const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius // doesn't change transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere_1); + transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); + transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); // assuming sphere position is in box frame: // read data: float4 sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); - + curr_jump_distance = 0.0; // check at exact timestep: - if (check_sphere_aabb(loc_bounds, loc_sphere_1)) + if (check_sphere_aabb(loc_bounds, loc_sphere_1, inside, delta, curr_jump_distance, sph_dist)) { - compute_sphere_gradient(grad_loc_bounds, loc_sphere_1, sum_pt, eta); - curr_jump_distance = loc_sphere_1.w; // TODO: check if this is better + + scale_eta_metric(delta, sph_dist, eta, sum_pt); } else if (sweep_back || sweep_fwd) { // there is no collision, compute the distance to obstacle: - curr_jump_distance = compute_distance(grad_loc_bounds, loc_sphere_1) - - loc_sphere_1.w; // - eta; + curr_jump_distance = compute_distance_fn(loc_bounds, loc_sphere_1, + max_distance, delta, sph_dist, distance, inside); + } + curr_jump_distance = fabsf(curr_jump_distance) - loc_sphere_1.w; + curr_jump_distance = max(curr_jump_distance, loc_sphere_1.w); + const float jump_mid_distance = curr_jump_distance; // compute distance between sweep spheres: if (sweep_back && (jump_mid_distance < sphere_0_distance / 2)) { - transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); // get unit vector: // loc_sphere_0 = (loc_sphere_0 - loc_sphere_1)/(sphere_0_len); // loop over sweep steps and accumulate distance: + #pragma unroll for (int j = 0; j < sw_steps; j++) { // jump by current jump distance: @@ -925,17 +2136,25 @@ namespace Curobo break; } k0 = 1 - (curr_jump_distance / sphere_0_len); - check_jump_distance(loc_sphere_1, loc_sphere_0, k0, eta, loc_bounds, - grad_loc_bounds, sum_pt, curr_jump_distance); + check_jump_distance(loc_sphere_1, loc_sphere_0, + k0, + loc_bounds, + max_distance, + delta, + sph_dist, + distance, + inside, + eta, + sum_pt, + curr_jump_distance); } } if (sweep_fwd && (jump_mid_distance < (sphere_2_len / 2))) { curr_jump_distance = 
jump_mid_distance; - transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); - + #pragma unroll for (int j = 0; j < sw_steps; j++) { if (curr_jump_distance >= (sphere_2_len / 2)) @@ -943,23 +2162,54 @@ namespace Curobo break; } k0 = 1 - curr_jump_distance / sphere_2_len; - check_jump_distance(loc_sphere_1, loc_sphere_2, k0, eta, loc_bounds, - grad_loc_bounds, sum_pt, curr_jump_distance); + check_jump_distance(loc_sphere_1, loc_sphere_2, + k0, + loc_bounds, + max_distance, + delta, + sph_dist, + distance, + inside, + eta, + sum_pt, + curr_jump_distance); } } - - if (sum_pt.w > 0) + if (SUM_COLLISIONS) + { + if (sum_pt.w > 0) // max_dist starts at 0 { max_dist += sum_pt.w; // transform point back if required: if (transform_back) { - inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad); + //inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad); + inv_transform_vec_quat_add(obb_pos, obb_quat, sum_pt, max_grad); + } // break;// break after first obstacle collision } + } + else + { + + if (sum_pt.w > max_dist) // max_dist starts at 0 + { + max_dist = sum_pt.w; + + // transform point back if required: + if (transform_back) + { + inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad); + //inv_transform_vec_quat_add(obb_pos, obb_quat, sum_pt, max_grad); + + } + + // break;// break after first obstacle collision + } + } } @@ -981,7 +2231,7 @@ namespace Curobo } // computer speed metric here: - if (enable_speed_metric) + if (ENABLE_SPEED_METRIC) { if (sweep_back && sweep_fwd) { @@ -999,6 +2249,7 @@ namespace Curobo out_distance[bhs_idx] = max_dist; } + /** * @brief Swept Collision checking. Note: This function currently does not @@ -1026,14 +2277,14 @@ namespace Curobo * @param sweep_steps * @return __device__ */ - template + template __device__ __forceinline__ void swept_sphere_obb_collision_fn( const scalar_t *sphere_position, const int env_idx, const int b_idx, - const int h_idx, const int sph_idx, scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, + const int h_idx, const int sph_idx, dist_scalar_t *out_distance, + const float *weight, const float *activation_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_mat, const uint8_t *obb_enable, const int max_nobs, const int nboxes, const int batch_size, const int horizon, const int nspheres, const int sweep_steps) { @@ -1045,7 +2296,12 @@ namespace Curobo b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; const int start_box_idx = max_nobs * env_idx; const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; - + float4 loc_grad = make_float4(0,0,0,0); + float3 delta = make_float3(0,0,0); + bool inside = false; + float curr_jump_distance = 0.0; + float distance = 0.0; + float sph_dist = 0.0; // We read the same obstacles across // Load sphere_position input @@ -1101,11 +2357,14 @@ namespace Curobo transform_sphere_quat(&in_obb_mat[0], sphere_1_cache, loc_sphere_1); max_dist += box_idx; - - if (check_sphere_aabb(loc_bounds, loc_sphere_1)) + + if (check_sphere_aabb(loc_bounds, loc_sphere_1, inside, delta, curr_jump_distance, sph_dist)) { - max_dist = 1; + + max_dist = 1; + break; } + if (h_idx > 0) { @@ -1120,10 +2379,12 @@ namespace Curobo k1 = 1 - k0; interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_0; - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + if (check_sphere_aabb(loc_bounds, 
interpolated_sphere, inside, delta, curr_jump_distance, sph_dist)) { - max_dist = 1; - break; + + max_dist = 1; + break; + } } } @@ -1139,8 +2400,7 @@ namespace Curobo k0 = (j + 1) / (fl_sw_steps); k1 = 1 - k0; interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_2; - - if (check_sphere_aabb(loc_bounds, interpolated_sphere)) + if (check_sphere_aabb(loc_bounds, interpolated_sphere, inside, delta, curr_jump_distance, sph_dist)) { max_dist = 1; break; @@ -1156,13 +2416,15 @@ namespace Curobo out_distance[b_addrs + h_idx * nspheres + sph_idx] = weight[0] * max_dist; } - template + template __global__ void sphere_obb_distance_kernel( const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *obb_accel, - const scalar_t *obb_bounds, const scalar_t *obb_mat, + dist_scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight, + const float *activation_distance, + const float *max_distance, + const float *obb_accel, + const float *obb_bounds, const float *obb_mat, const uint8_t *obb_enable, const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, @@ -1185,40 +2447,56 @@ namespace Curobo b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; int env_idx = 0; - int env_nboxes = n_env_obb[0]; - if (batch_env_t) + if (BATCH_ENV_T) { env_idx = env_query_idx[b_idx]; // read env idx from current batch idx - env_nboxes = - n_env_obb[env_idx]; // read nboxes in current environment + } + const int env_nboxes = n_env_obb[env_idx]; // read nboxes in current environment + if (COMPUTE_ESDF) + { + sphere_obb_esdf_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + closest_pt, sparsity_idx, weight, activation_distance, max_distance, + obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs, + env_nboxes, transform_back); - sphere_obb_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + } + else + { + sphere_obb_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, closest_pt, sparsity_idx, weight, activation_distance, obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs, env_nboxes, transform_back); + } + // return the sphere distance here: // sync threads and do block level reduction: } - template - __global__ void swept_sphere_obb_distance_jump_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int max_nobs, const int batch_size, - const int horizon, const int nspheres, const int sweep_steps, const bool transform_back) - { - // This kernel jumps by sdf to only get gradients at collision points. 
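The comment retained above ("jumps by sdf to only get gradients at collision points") describes the sweep strategy that both the cuboid and voxel swept functions implement: march from one trajectory sphere toward its neighbor, query the obstacle distance at each probe, and advance by that distance, never less than one sphere radius, so free space is crossed in a few jumps. A rough sketch of that loop, assuming a generic `distance_to_obstacle` callable in place of the kernel's AABB/voxel queries:

```python
from typing import Callable
import torch

def swept_penetration(
    sphere_center: torch.Tensor,    # (3,) sphere at the current timestep
    neighbor_center: torch.Tensor,  # (3,) sphere at an adjacent timestep
    radius: float,
    distance_to_obstacle: Callable[[torch.Tensor], float],  # > 0 in free space, <= 0 inside
    sweep_steps: int,
) -> float:
    cost = 0.0
    segment_length = torch.linalg.norm(neighbor_center - sphere_center).item() + 2.0 * radius

    d = distance_to_obstacle(sphere_center)
    if d <= 0.0:
        cost += -d                       # penetration at the timestep itself
    traveled = max(abs(d), radius)       # first jump: at least one radius

    for _ in range(sweep_steps):
        if traveled >= segment_length / 2.0:
            break                        # the adjacent timestep covers the other half
        k = traveled / segment_length
        probe = (1.0 - k) * sphere_center + k * neighbor_center
        d = distance_to_obstacle(probe)
        if d <= 0.0:
            cost += -d
        traveled += max(abs(d), radius)  # jump by the reported free space, never less than a radius
    return cost

# Example: distance to the plane z = 0.5, sweeping between two poses.
swept_penetration(torch.zeros(3), torch.tensor([0.0, 0.0, 1.0]), 0.05, lambda p: float(p[2]) - 0.5, 8)
```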
+ template + __global__ void sphere_voxel_distance_kernel( + const geom_scalar_t *sphere_position, + dist_scalar_t *out_distance, + grad_scalar_t *closest_pt, + uint8_t *sparsity_idx, + const float *weight, + const float *activation_distance, + const float *max_distance, + const grid_scalar_t *grid_features, + const float *grid_params, const float *obb_mat, + const uint8_t *obb_enable, const int32_t *n_env_obb, + const int32_t *env_query_idx, + const int max_nobs, const int num_voxels, + const int batch_size, const int horizon, const int nspheres, + const bool transform_back) + { // spheres_per_block is number of spheres in a thread // compute local sphere batch by first dividing threadidx/nboxes + // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x; const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; const int b_idx = t_idx / (horizon * nspheres); const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; @@ -1228,26 +2506,49 @@ namespace Curobo { return; } + const int bn_sph_idx = + b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; - const int env_idx = 0; + int env_idx = 0; + int env_nboxes = n_env_obb[0]; + if (BATCH_ENV_T) + { + env_idx = + env_query_idx[b_idx]; // read env idx from current batch idx + env_nboxes = + n_env_obb[env_idx]; // read nboxes in current environment + } + if (COMPUTE_ESDF) + { + + sphere_voxel_esdf_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + closest_pt, sparsity_idx, weight, activation_distance, max_distance, + grid_features, grid_params, obb_mat, obb_enable, max_nobs, num_voxels, + transform_back); - const int env_nboxes = n_env_obb[env_idx]; - swept_sphere_obb_distance_fn( - sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, - sparsity_idx, weight, activation_distance, speed_dt, obb_accel, - obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, - horizon, nspheres, sweep_steps, transform_back); + } + else + { + sphere_voxel_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + closest_pt, sparsity_idx, weight, activation_distance, max_distance, + grid_features, grid_params, obb_mat, obb_enable, max_nobs, num_voxels, + transform_back); + } + + // return the sphere distance here: + // sync threads and do block level reduction: } - template - __global__ void swept_sphere_obb_distance_jump_batch_env_kernel( + + template + __global__ void swept_sphere_obb_distance_jump_kernel( const scalar_t *sphere_position, - scalar_t *out_distance, - scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, - const scalar_t *activation_distance, const scalar_t *speed_dt, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, + dist_scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight, + const float *activation_distance, const float *speed_dt, + const float *obb_accel, const float *obb_bounds, + const float *obb_pose, const uint8_t *obb_enable, const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, const int batch_size, const int horizon, const int nspheres, const int sweep_steps, @@ -1270,23 +2571,29 @@ namespace Curobo return; } - const int env_idx = env_query_idx[b_idx]; + int env_idx = 0; + + if (BATCH_ENV_T) + { + env_idx = env_query_idx[b_idx]; + } + const int env_nboxes = n_env_obb[env_idx]; - swept_sphere_obb_distance_fn( + swept_sphere_obb_distance_fn( sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, 
sparsity_idx, weight, activation_distance, speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps, transform_back); } - template + template __global__ void swept_sphere_obb_collision_kernel( const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, + dist_scalar_t *out_distance, + const float *weight, const float *activation_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_pose, const uint8_t *obb_enable, const int32_t *n_env_obb, const int max_nobs, const int batch_size, const int horizon, const int nspheres, const int sweep_steps) { @@ -1311,13 +2618,13 @@ namespace Curobo max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); } - template + template __global__ void swept_sphere_obb_collision_batch_env_kernel( const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_pose, const uint8_t *obb_enable, + dist_scalar_t *out_distance, + const float *weight, const float *activation_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_pose, const uint8_t *obb_enable, const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, const int batch_size, const int horizon, const int nspheres, const int sweep_steps) @@ -1343,44 +2650,14 @@ namespace Curobo max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); } - template - __global__ void sphere_obb_collision_kernel( - const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, - const int32_t *n_env_obb, const int max_nobs, const int batch_size, - const int horizon, const int nspheres) - { - // spheres_per_block is number of spheres in a thread - // compute local sphere batch by first dividing threadidx/nboxes - const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; - const int b_idx = t_idx / (horizon * nspheres); - const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; - const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); - if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon)) - { - return; - } - const int env_idx = 0; - const int env_nboxes = n_env_obb[env_idx]; - const int bn_sph_idx = - b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; - sphere_obb_collision_fn(sphere_position, - env_idx, bn_sph_idx, sph_idx, out_distance, - weight, activation_distance, obb_accel, obb_bounds, - obb_mat, obb_enable, max_nobs, env_nboxes); - } - - template + template __global__ void sphere_obb_collision_batch_env_kernel( const scalar_t *sphere_position, - scalar_t *out_distance, - const scalar_t *weight, const scalar_t *activation_distance, - const scalar_t *obb_accel, const scalar_t *obb_bounds, - const scalar_t *obb_mat, const uint8_t *obb_enable, + dist_scalar_t *out_distance, + const float *weight, const float *activation_distance, + const float *obb_accel, const float *obb_bounds, + const float *obb_mat, const uint8_t *obb_enable, const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, const int batch_size, const int horizon, 
const int nspheres) { @@ -1395,8 +2672,14 @@ namespace Curobo { return; } - const int env_idx = env_query_idx[b_idx]; + int env_idx = 0; + if(BATCH_ENV_T) + { + env_idx = env_query_idx[b_idx]; + } + const int env_nboxes = n_env_obb[env_idx]; + const int bn_sph_idx = b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; sphere_obb_collision_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, @@ -1413,6 +2696,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3 torch::Tensor closest_point, // batch size, 3 torch::Tensor sparsity_idx, const torch::Tensor weight, const torch::Tensor activation_distance, + const torch::Tensor max_distance, const torch::Tensor obb_accel, // n_boxes, 4, 4 const torch::Tensor obb_bounds, // n_boxes, 3 const torch::Tensor obb_pose, // n_boxes, 4, 4 @@ -1421,13 +2705,16 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3 const torch::Tensor env_query_idx, // n_boxes, 4, 4 const int max_nobs, const int batch_size, const int horizon, const int n_spheres, const bool transform_back, - const bool compute_distance, const bool use_batch_env) + const bool compute_distance, const bool use_batch_env, + const bool sum_collisions, + const bool compute_esdf) { using namespace Curobo::Geometry; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const int bnh_spheres = n_spheres * batch_size * horizon; // - + const bool scale_metric = true; int threadsPerBlock = bnh_spheres; + const bool sum_collisions_ = true; if (threadsPerBlock > 128) { @@ -1435,93 +2722,85 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3 } int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; - if (use_batch_env) + if (!compute_distance) { - if (compute_distance) + + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "SphereObb_clpt_collision", ([&]{ + auto collision_kernel = sphere_obb_collision_batch_env_kernel; + auto batch_collision_kernel = sphere_obb_collision_batch_env_kernel; + auto selected_k = collision_kernel; + if (use_batch_env) { - // TODO: call kernel based on flag: - AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_clpt", ([&] { - sphere_obb_distance_kernel - << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( - sphere_position.data_ptr(), - distance.data_ptr(), - closest_point.data_ptr(), - sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, - horizon, n_spheres, transform_back); - })); + selected_k = batch_collision_kernel; } - else - { - // TODO: call kernel based on flag: - AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - sphere_obb_collision_batch_env_kernel - << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + + selected_k<< < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), env_query_idx.data_ptr(), max_nobs, batch_size, horizon, n_spheres); - })); - } + })); + } else { - if (compute_distance) + + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && 
TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2 + const auto fp8_type = torch::kFloat8_e4m3fn; + #else + const auto fp8_type = torch::kHalf; + #endif + // typename scalar_t, typename dist_scalar_t=float, bool BATCH_ENV_T=true, bool SCALE_METRIC=true, bool SUM_COLLISIONS=true, bool COMPUTE_ESDF=false + AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, fp8_type, + distance.scalar_type(), "SphereObb_clpt", ([&]{ + auto distance_kernel = sphere_obb_distance_kernel; + if (use_batch_env) { - // TODO: call kernel based on flag: - AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_clpt", ([&] { - sphere_obb_distance_kernel - << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + if (compute_esdf) + { + distance_kernel = sphere_obb_distance_kernel; + + } + else + { + + distance_kernel = sphere_obb_distance_kernel; + } + + } + else if (compute_esdf) + { + distance_kernel = sphere_obb_distance_kernel; + } + + distance_kernel<< < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), distance.data_ptr(), closest_point.data_ptr(), sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + max_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), - env_query_idx.data_ptr(), max_nobs, batch_size, + env_query_idx.data_ptr(), + max_nobs, batch_size, horizon, n_spheres, transform_back); - })); - } - else - { - // TODO: call kernel based on flag: - AT_DISPATCH_FLOATING_TYPES( - distance.scalar_type(), "SphereObb_collision", ([&] { - sphere_obb_collision_kernel - << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( - sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), - obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, - horizon, n_spheres); - })); - } + + + })); } + C10_CUDA_KERNEL_LAUNCH_CHECK(); return { distance, closest_point, sparsity_idx }; @@ -1534,7 +2813,8 @@ std::vectorswept_sphere_obb_clpt( torch::Tensor closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient torch::Tensor sparsity_idx, const torch::Tensor weight, - const torch::Tensor activation_distance, const torch::Tensor speed_dt, + const torch::Tensor activation_distance, + const torch::Tensor speed_dt, const torch::Tensor obb_accel, // n_boxes, 4, 4 const torch::Tensor obb_bounds, // n_boxes, 3 const torch::Tensor obb_pose, // n_boxes, 4, 4 @@ -1544,11 +2824,11 @@ std::vectorswept_sphere_obb_clpt( const int max_nobs, const int batch_size, const int horizon, const int n_spheres, const int sweep_steps, const bool enable_speed_metric, const bool transform_back, const bool compute_distance, - const bool use_batch_env) + const bool use_batch_env, + const bool sum_collisions) { using namespace Curobo::Geometry; - - + // const int max_batches_per_block = 128; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -1567,7 +2847,10 @@ std::vectorswept_sphere_obb_clpt( } int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; - + if (sum_collisions) + { + const bool sum_collisions_ = true; + if (use_batch_env) { if (compute_distance) @@ -1577,18 +2860,18 @@ std::vectorswept_sphere_obb_clpt( // This is the best kernel for now AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", 
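The host-side dispatch above widens `AT_DISPATCH_FLOATING_TYPES` to `AT_DISPATCH_FLOATING_TYPES_AND2` so bfloat16 and, on CUDA >= 11.8 with a recent enough PyTorch, float8_e4m3fn tensors are accepted, with half precision as the fallback. A caller could express the same guard from Python; the helper below is made up for illustration and is not part of this patch:

```python
import torch

def pick_low_precision_dtype() -> torch.dtype:
    # float8_e4m3fn only exists in newer PyTorch builds; fall back to fp16 otherwise,
    # mirroring the kHalf fallback in the C++ dispatch above.
    return getattr(torch, "float8_e4m3fn", torch.float16)

features = torch.zeros(8, dtype=torch.float32).to(pick_low_precision_dtype())
```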
([&] { - swept_sphere_obb_distance_jump_batch_env_kernel + swept_sphere_obb_distance_jump_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), + distance.data_ptr(), closest_point.data_ptr(), sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), env_query_idx.data_ptr(), max_nobs, batch_size, @@ -1598,21 +2881,20 @@ std::vectorswept_sphere_obb_clpt( } else { - // This is the best kernel for now AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_batch_env_kernel + swept_sphere_obb_distance_jump_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), + distance.data_ptr(), closest_point.data_ptr(), sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), env_query_idx.data_ptr(), max_nobs, batch_size, @@ -1628,14 +2910,191 @@ std::vectorswept_sphere_obb_clpt( // TODO: call kernel based on flag: AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "SphereObb_collision", ([&] { - swept_sphere_obb_collision_batch_env_kernel + swept_sphere_obb_collision_batch_env_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps); + })); + } + } + else + { + if (compute_distance) + { + if (enable_speed_metric) + { + if (sweep_steps == 4) + { + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); + } + else + { + + + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), 
max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); + } + } + else + { + // This is the best kernel for now + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); + } + } + else + { + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "SphereObb_collision", ([&] { + swept_sphere_obb_collision_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps); + })); + } + } + } + else + { + const bool sum_collisions_ = true; + if (use_batch_env) + { + if (compute_distance) + { + if (enable_speed_metric) + { + // This is the best kernel for now + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); + } + else + { + // This is the best kernel for now + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { + swept_sphere_obb_distance_jump_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, + horizon, n_spheres, sweep_steps, + transform_back); + })); + } + } + else + { + // TODO: implement this later + + // TODO: call kernel based on flag: + AT_DISPATCH_FLOATING_TYPES( + distance.scalar_type(), "SphereObb_collision", ([&] { + swept_sphere_obb_collision_batch_env_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), env_query_idx.data_ptr(), max_nobs, batch_size, @@ -1652,20 +3111,21 @@ std::vectorswept_sphere_obb_clpt( // This is the best kernel for now AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_kernel + swept_sphere_obb_distance_jump_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - 
distance.data_ptr(), + distance.data_ptr(), closest_point.data_ptr(), sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, horizon, n_spheres, sweep_steps, transform_back); })); @@ -1675,20 +3135,21 @@ std::vectorswept_sphere_obb_clpt( // This is the best kernel for now AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "Swept_SphereObb_clpt", ([&] { - swept_sphere_obb_distance_jump_kernel + swept_sphere_obb_distance_jump_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), + distance.data_ptr(), closest_point.data_ptr(), sparsity_idx.data_ptr(), - weight.data_ptr(), - activation_distance.data_ptr(), - speed_dt.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + speed_dt.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), - n_env_obb.data_ptr(), max_nobs, batch_size, + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), max_nobs, batch_size, horizon, n_spheres, sweep_steps, transform_back); })); @@ -1698,14 +3159,14 @@ std::vectorswept_sphere_obb_clpt( { AT_DISPATCH_FLOATING_TYPES( distance.scalar_type(), "SphereObb_collision", ([&] { - swept_sphere_obb_collision_kernel + swept_sphere_obb_collision_kernel << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( sphere_position.data_ptr(), - distance.data_ptr(), weight.data_ptr(), - activation_distance.data_ptr(), - obb_accel.data_ptr(), - obb_bounds.data_ptr(), - obb_pose.data_ptr(), + distance.data_ptr(), weight.data_ptr(), + activation_distance.data_ptr(), + obb_accel.data_ptr(), + obb_bounds.data_ptr(), + obb_pose.data_ptr(), obb_enable.data_ptr(), n_env_obb.data_ptr(), max_nobs, batch_size, horizon, n_spheres, sweep_steps); @@ -1713,7 +3174,241 @@ std::vectorswept_sphere_obb_clpt( } } + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); return { distance, closest_point, sparsity_idx }; // , debug_data}; } + + +std::vector +sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor max_distance, + const torch::Tensor grid_features, // n_boxes, 4, 4 + const torch::Tensor grid_params, // n_boxes, 3 + const torch::Tensor obb_pose, // n_boxes, 4, 4 + const torch::Tensor obb_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_obb, + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, const int batch_size, const int horizon, + const int n_spheres, const bool transform_back, + const bool compute_distance, const bool use_batch_env, + const bool sum_collisions, + const bool compute_esdf) +{ + using namespace Curobo::Geometry; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int bnh_spheres = n_spheres * batch_size * horizon; // + const bool scale_metric = true; + const int num_voxels = grid_features.size(-2); + int threadsPerBlock = bnh_spheres; + + if (threadsPerBlock > 128) + { + threadsPerBlock = 128; + } + // 
bfloat16 + //ScalarType::Float8_e4m3fn + + int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; + + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2 + const auto fp8_type = torch::kFloat8_e4m3fn; + #else + const auto fp8_type = torch::kHalf; + #endif + + AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, fp8_type, + grid_features.scalar_type(), "SphereVoxel_clpt", ([&] + { + + auto kernel_esdf = sphere_voxel_distance_kernel; + auto kernel_distance_1 = sphere_voxel_distance_kernel; + auto kernel_distance_2 = sphere_voxel_distance_kernel; + auto kernel_distance_3 = sphere_voxel_distance_kernel; + auto kernel_distance_4 = sphere_voxel_distance_kernel; + + auto kernel_distance_n = sphere_voxel_distance_kernel; + auto selected_kernel = kernel_distance_n; + if (compute_esdf) + { + selected_kernel = kernel_esdf; + + } + else + { + switch (max_nobs){ + case 1: + selected_kernel = kernel_distance_1; + break; + case 2: + selected_kernel = kernel_distance_2; + break; + case 3: + selected_kernel = kernel_distance_3; + break; + case 4: + selected_kernel = kernel_distance_4; + break; + default: + selected_kernel = kernel_distance_n; + break; + } + } + + + selected_kernel + << < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + max_distance.data_ptr(), + grid_features.data_ptr(), + grid_params.data_ptr(), + obb_pose.data_ptr(), + obb_enable.data_ptr(), + n_env_obb.data_ptr(), + env_query_idx.data_ptr(), + max_nobs, + num_voxels, + batch_size, + horizon, n_spheres, transform_back); + + })); + + + + + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { distance, closest_point, sparsity_idx }; +} + + + +std::vector +swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 + torch::Tensor distance, + torch::Tensor closest_point, // batch size, 3 + torch::Tensor sparsity_idx, const torch::Tensor weight, + const torch::Tensor activation_distance, + const torch::Tensor max_distance, + const torch::Tensor speed_dt, + const torch::Tensor grid_features, // n_boxes, 4, 4 + const torch::Tensor grid_params, // n_boxes, 3 + const torch::Tensor grid_pose, // n_boxes, 4, 4 + const torch::Tensor grid_enable, // n_boxes, 4, 4 + const torch::Tensor n_env_grid, + const torch::Tensor env_query_idx, // n_boxes, 4, 4 + const int max_nobs, + const int batch_size, + const int horizon, + const int n_spheres, + const int sweep_steps, + const bool enable_speed_metric, + const bool transform_back, + const bool compute_distance, + const bool use_batch_env, + const bool sum_collisions) +{ + using namespace Curobo::Geometry; + + const at::cuda::OptionalCUDAGuard guard(sphere_position.device()); + CHECK_INPUT(sphere_position); + CHECK_INPUT(distance); + CHECK_INPUT(closest_point); + CHECK_INPUT(sparsity_idx); + CHECK_INPUT(weight); + CHECK_INPUT(activation_distance); + CHECK_INPUT(max_distance); + CHECK_INPUT(speed_dt); + CHECK_INPUT(grid_features); + CHECK_INPUT(grid_params); + CHECK_INPUT(grid_pose); + CHECK_INPUT(grid_enable); + CHECK_INPUT(n_env_grid); + CHECK_INPUT(env_query_idx); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int bnh_spheres = n_spheres * batch_size * horizon; // + const bool scale_metric = true; + const int num_voxels = grid_features.size(-2); + int threadsPerBlock = bnh_spheres; + + if (threadsPerBlock > 128) + { + threadsPerBlock = 128; + 
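`swept_sphere_voxel_clpt` above validates its tensors with `CHECK_INPUT` and an `OptionalCUDAGuard` before launching, since the kernels index raw data pointers. A Python-side analogue of that precondition, written here only to spell out what those checks typically enforce (contiguous CUDA tensors on a single device); it is not part of this patch:

```python
import torch

def check_inputs(*tensors: torch.Tensor) -> None:
    # Python-side analogue of the CHECK_INPUT calls above: the kernels index raw data
    # pointers, so every tensor must be a contiguous CUDA tensor on the same device.
    device = tensors[0].device
    for t in tensors:
        assert t.is_cuda, "expected a CUDA tensor"
        assert t.is_contiguous(), "expected a contiguous tensor"
        assert t.device == device, "all tensors must share one device"
```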
} + // bfloat16 + //ScalarType::Float8_e4m3fn + + int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock; + + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2 + const auto fp8_type = torch::kFloat8_e4m3fn; + #else + const auto fp8_type = torch::kHalf; + #endif + + AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, fp8_type, + grid_features.scalar_type(), "SphereVoxel_clpt", ([&] { + + auto collision_kernel_n = swept_sphere_voxel_distance_jump_kernel; + auto collision_kernel_1 = swept_sphere_voxel_distance_jump_kernel; + auto collision_kernel_2 = swept_sphere_voxel_distance_jump_kernel; + auto collision_kernel_3 = swept_sphere_voxel_distance_jump_kernel; + auto collision_kernel_4 = swept_sphere_voxel_distance_jump_kernel; + auto selected_kernel = collision_kernel_n; + switch (max_nobs){ + case 1: + selected_kernel = collision_kernel_1; + break; + case 2: + selected_kernel = collision_kernel_2; + break; + case 3: + selected_kernel = collision_kernel_3; + break; + case 4: + selected_kernel = collision_kernel_4; + break; + default: + selected_kernel = collision_kernel_n; + } + + selected_kernel<< < blocksPerGrid, threadsPerBlock, 0, stream >> > ( + sphere_position.data_ptr(), + distance.data_ptr(), + closest_point.data_ptr(), + sparsity_idx.data_ptr(), + weight.data_ptr(), + activation_distance.data_ptr(), + max_distance.data_ptr(), + speed_dt.data_ptr(), + grid_features.data_ptr(), + grid_params.data_ptr(), + grid_pose.data_ptr(), + grid_enable.data_ptr(), + n_env_grid.data_ptr(), + env_query_idx.data_ptr(), + max_nobs, + num_voxels, + batch_size, + horizon, n_spheres, sweep_steps, transform_back); + })); + + + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { distance, closest_point, sparsity_idx }; +} + diff --git a/src/curobo/curobolib/cpp/tensor_step_kernel.cu b/src/curobo/curobolib/cpp/tensor_step_kernel.cu index 933053b..a496031 100644 --- a/src/curobo/curobolib/cpp/tensor_step_kernel.cu +++ b/src/curobo/curobolib/cpp/tensor_step_kernel.cu @@ -342,10 +342,8 @@ namespace Curobo float out_pos = 0.0, out_vel = 0.0, out_acc = 0.0, out_jerk = 0.0; float st_pos = 0.0, st_vel = 0.0, st_acc = 0.0; - const int b_addrs = b_idx * horizon * dof; const int b_addrs_action = b_idx * (horizon - 4) * dof; float in_pos[5]; // create a 5 value scalar - const float acc_scale = 1.0; #pragma unroll 5 diff --git a/src/curobo/curobolib/geom.py b/src/curobo/curobolib/geom.py index fd1caf0..324d010 100644 --- a/src/curobo/curobolib/geom.py +++ b/src/curobo/curobolib/geom.py @@ -13,6 +13,7 @@ import torch # CuRobo from curobo.util.logger import log_warn +from curobo.util.torch_utils import get_torch_jit_decorator try: # CuRobo @@ -235,7 +236,7 @@ def get_pose_distance_backward( return r[0], r[1] -@torch.jit.script +@get_torch_jit_decorator() def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec): grad_vec = grad_g_dist + (grad_out_distance * weight) grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec @@ -243,7 +244,7 @@ def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec): # full method: -@torch.jit.script +@get_torch_jit_decorator() def backward_full_PoseError_jit( grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q ): @@ -570,6 +571,7 @@ class SdfSphereOBB(torch.autograd.Function): sparsity_idx, weight, activation_distance, + max_distance, box_accel, box_dims, box_pose, @@ -584,6 +586,8 @@ class SdfSphereOBB(torch.autograd.Function): compute_distance, use_batch_env, return_loss: bool = 
False, + sum_collisions: bool = True, + compute_esdf: bool = False, ): r = geom_cu.closest_point( query_sphere, @@ -592,6 +596,7 @@ class SdfSphereOBB(torch.autograd.Function): sparsity_idx, weight, activation_distance, + max_distance, box_accel, box_dims, box_pose, @@ -605,8 +610,11 @@ class SdfSphereOBB(torch.autograd.Function): transform_back, compute_distance, use_batch_env, + sum_collisions, + compute_esdf, ) # r[1][r[1]!=r[1]] = 0.0 + ctx.compute_esdf = compute_esdf ctx.return_loss = return_loss ctx.save_for_backward(r[1]) return r[0] @@ -615,6 +623,8 @@ class SdfSphereOBB(torch.autograd.Function): def backward(ctx, grad_output): grad_pt = None if ctx.needs_input_grad[0]: + # if ctx.compute_esdf: + # raise NotImplementedError("Gradients not implemented for compute_esdf=True") (r,) = ctx.saved_tensors if ctx.return_loss: r = r * grad_output.unsqueeze(-1) @@ -640,6 +650,9 @@ class SdfSphereOBB(torch.autograd.Function): None, None, None, + None, + None, + None, ) @@ -670,6 +683,7 @@ class SdfSweptSphereOBB(torch.autograd.Function): compute_distance, use_batch_env, return_loss: bool = False, + sum_collisions: bool = True, ): r = geom_cu.swept_closest_point( query_sphere, @@ -694,6 +708,7 @@ class SdfSweptSphereOBB(torch.autograd.Function): transform_back, compute_distance, use_batch_env, + sum_collisions, ) ctx.return_loss = return_loss ctx.save_for_backward( @@ -733,4 +748,200 @@ class SdfSweptSphereOBB(torch.autograd.Function): None, None, None, + None, + ) + + +class SdfSphereVoxel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + max_distance, + grid_features, + grid_params, + grid_pose, + grid_enable, + n_env_grid, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + transform_back, + compute_distance, + use_batch_env, + return_loss: bool = False, + sum_collisions: bool = True, + compute_esdf: bool = False, + ): + + r = geom_cu.closest_point_voxel( + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + max_distance, + grid_features, + grid_params, + grid_pose, + grid_enable, + n_env_grid, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + transform_back, + compute_distance, + use_batch_env, + sum_collisions, + compute_esdf, + ) + ctx.compute_esdf = compute_esdf + ctx.return_loss = return_loss + ctx.save_for_backward(r[1]) + return r[0] + + @staticmethod + def backward(ctx, grad_output): + grad_pt = None + if ctx.needs_input_grad[0]: + # if ctx.compute_esdf: + # raise NotImplementedError("Gradients not implemented for compute_esdf=True") + (r,) = ctx.saved_tensors + if ctx.return_loss: + r = r * grad_output.unsqueeze(-1) + grad_pt = r + return ( + grad_pt, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class SdfSweptSphereVoxel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + max_distance, + speed_dt, + grid_features, + grid_params, + grid_pose, + grid_enable, + n_env_grid, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + sweep_steps, + enable_speed_metric, + transform_back, + compute_distance, + use_batch_env, + return_loss: bool = False, + sum_collisions: bool = True, + ): + r = geom_cu.swept_closest_point_voxel( + 
query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + max_distance, + speed_dt, + grid_features, + grid_params, + grid_pose, + grid_enable, + n_env_grid, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + sweep_steps, + enable_speed_metric, + transform_back, + compute_distance, + use_batch_env, + sum_collisions, + ) + + ctx.return_loss = return_loss + ctx.save_for_backward( + r[1], + ) + return r[0] + + @staticmethod + def backward(ctx, grad_output): + grad_pt = None + if ctx.needs_input_grad[0]: + (r,) = ctx.saved_tensors + if ctx.return_loss: + r = r * grad_output.unsqueeze(-1) + grad_pt = r + return ( + grad_pt, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, ) diff --git a/src/curobo/curobolib/opt.py b/src/curobo/curobolib/opt.py index 26b4759..0be088f 100644 --- a/src/curobo/curobolib/opt.py +++ b/src/curobo/curobolib/opt.py @@ -39,7 +39,8 @@ except ImportError: class LBFGScu(Function): @staticmethod - def _call_cuda( + def forward( + ctx, step_vec, rho_buffer, y_buffer, @@ -50,6 +51,7 @@ class LBFGScu(Function): grad_0, epsilon=0.1, stable_mode=False, + use_shared_buffers=True, ): m, b, v_dim, _ = y_buffer.shape @@ -67,39 +69,12 @@ class LBFGScu(Function): m, v_dim, stable_mode, + use_shared_buffers, ) step_v = R[0].view(step_vec.shape) - return step_v - - @staticmethod - def forward( - ctx, - step_vec, - rho_buffer, - y_buffer, - s_buffer, - q, - grad_q, - x_0, - grad_0, - epsilon=0.1, - stable_mode=False, - ): - R = LBFGScu._call_cuda( - step_vec, - rho_buffer, - y_buffer, - s_buffer, - q, - grad_q, - x_0, - grad_0, - epsilon=epsilon, - stable_mode=stable_mode, - ) # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map) - return R + return step_v @staticmethod def backward(ctx, grad_output): @@ -109,4 +84,5 @@ class LBFGScu(Function): None, None, None, + None, ) diff --git a/src/curobo/geom/cv.py b/src/curobo/geom/cv.py index f2df669..b2b9da0 100644 --- a/src/curobo/geom/cv.py +++ b/src/curobo/geom/cv.py @@ -12,8 +12,11 @@ # Third Party import torch +# CuRobo +from curobo.util.torch_utils import get_torch_jit_decorator -@torch.jit.script + +@get_torch_jit_decorator() def project_depth_to_pointcloud(depth_image: torch.Tensor, intrinsics_matrix: torch.Tensor): """Projects numpy depth image to point cloud. @@ -43,7 +46,7 @@ def project_depth_to_pointcloud(depth_image: torch.Tensor, intrinsics_matrix: to return raw_pc -@torch.jit.script +@get_torch_jit_decorator() def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor): """Projects numpy depth image to point cloud. 
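# Minimal usage sketch for the two projection helpers in this file, assuming a
# batched 3x3 intrinsics matrix and a (batch, height, width) depth tensor on the
# same CUDA device (shapes and values below are illustrative only):
import torch
from curobo.geom.cv import get_projection_rays, project_depth_using_rays

intrinsics = torch.tensor(
    [[[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]]], device="cuda"
)
depth = torch.zeros(1, 480, 640, device="cuda")  # placeholder depth values
rays = get_projection_rays(480, 640, intrinsics)  # precompute once per camera
points = project_depth_using_rays(depth, rays)  # reuse the rays for every frame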
@@ -54,10 +57,10 @@ def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor Returns: array of float (h, w, 3) """ - fx = intrinsics_matrix[:, 0, 0] - fy = intrinsics_matrix[:, 1, 1] - cx = intrinsics_matrix[:, 0, 2] - cy = intrinsics_matrix[:, 1, 2] + fx = intrinsics_matrix[:, 0:1, 0:1] + fy = intrinsics_matrix[:, 1:2, 1:2] + cx = intrinsics_matrix[:, 0:1, 2:3] + cy = intrinsics_matrix[:, 1:2, 2:3] input_x = torch.arange(width, dtype=torch.float32, device=intrinsics_matrix.device) input_y = torch.arange(height, dtype=torch.float32, device=intrinsics_matrix.device) @@ -73,7 +76,6 @@ def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor device=intrinsics_matrix.device, dtype=torch.float32, ) - output_x = (input_x - cx) / fx output_y = (input_y - cy) / fy @@ -84,7 +86,7 @@ def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor return rays -@torch.jit.script +@get_torch_jit_decorator() def project_pointcloud_to_depth( pointcloud: torch.Tensor, output_image: torch.Tensor, @@ -106,7 +108,7 @@ def project_pointcloud_to_depth( return output_image -@torch.jit.script +@get_torch_jit_decorator() def project_depth_using_rays( depth_image: torch.Tensor, rays: torch.Tensor, filter_origin: bool = False ): diff --git a/src/curobo/geom/sdf/sdf_grid.py b/src/curobo/geom/sdf/sdf_grid.py index 1463f12..a2fb326 100644 --- a/src/curobo/geom/sdf/sdf_grid.py +++ b/src/curobo/geom/sdf/sdf_grid.py @@ -11,8 +11,10 @@ # Third Party import torch +# from curobo.util.torch_utils import get_torch_jit_decorator -# @torch.jit.script + +# @get_torch_jit_decorator() def lookup_distance(pt, dist_matrix_flat, num_voxels): # flatten: ind_pt = ( @@ -22,7 +24,7 @@ def lookup_distance(pt, dist_matrix_flat, num_voxels): return dist -# @torch.jit.script +# @get_torch_jit_decorator() def compute_sdf_gradient(pt, dist_matrix_flat, num_voxels, dist): grad_l = [] for i in range(3): # x,y,z diff --git a/src/curobo/geom/sdf/utils.py b/src/curobo/geom/sdf/utils.py index 9d482c1..133c96b 100644 --- a/src/curobo/geom/sdf/utils.py +++ b/src/curobo/geom/sdf/utils.py @@ -30,6 +30,11 @@ def create_collision_checker(config: WorldCollisionConfig): from curobo.geom.sdf.world_mesh import WorldMeshCollision return WorldMeshCollision(config) + elif config.checker_type == CollisionCheckerType.VOXEL: + # CuRobo + from curobo.geom.sdf.world_voxel import WorldVoxelCollision + + return WorldVoxelCollision(config) else: - log_error("Not implemented", exc_info=True) + log_error("Unknown Collision Checker type: " + config.checker_type, exc_info=True) raise NotImplementedError diff --git a/src/curobo/geom/sdf/warp_primitives.py b/src/curobo/geom/sdf/warp_primitives.py index 93bfbee..b748122 100644 --- a/src/curobo/geom/sdf/warp_primitives.py +++ b/src/curobo/geom/sdf/warp_primitives.py @@ -16,284 +16,6 @@ import warp as wp wp.set_module_options({"fast_math": False}) -# create warp kernels: -@wp.kernel -def get_swept_closest_pt( - pt: wp.array(dtype=wp.vec4), - distance: wp.array(dtype=wp.float32), # this stores the output cost - closest_pt: wp.array(dtype=wp.float32), # this stores the gradient - sparsity_idx: wp.array(dtype=wp.uint8), - weight: wp.array(dtype=wp.float32), - activation_distance: wp.array(dtype=wp.float32), # eta threshold - speed_dt: wp.array(dtype=wp.float32), - mesh: wp.array(dtype=wp.uint64), - mesh_pose: wp.array(dtype=wp.float32), - mesh_enable: wp.array(dtype=wp.uint8), - n_env_mesh: wp.array(dtype=wp.int32), - max_dist: wp.float32, - write_grad: wp.uint8, - 
batch_size: wp.int32, - horizon: wp.int32, - nspheres: wp.int32, - max_nmesh: wp.int32, - sweep_steps: wp.uint8, - enable_speed_metric: wp.uint8, -): - # we launch nspheres kernels - # compute gradient here and return - # distance is negative outside and positive inside - tid = int(0) - tid = wp.tid() - - b_idx = int(0) - h_idx = int(0) - sph_idx = int(0) - # read horizon - eta = float(0.0) # 5cm buffer - dt = float(1.0) - - b_idx = tid / (horizon * nspheres) - - h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres - sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) - if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: - return - - n_mesh = int(0) - # $wp.printf("%d, %d, %d, %d \n", tid, b_idx, h_idx, sph_idx) - # read sphere - sphere_0_distance = float(0.0) - sphere_2_distance = float(0.0) - - sphere_0 = wp.vec3(0.0) - sphere_2 = wp.vec3(0.0) - sphere_int = wp.vec3(0.0) - sphere_temp = wp.vec3(0.0) - k0 = float(0.0) - face_index = int(0) - face_u = float(0.0) - face_v = float(0.0) - sign = float(0.0) - dist = float(0.0) - - uint_zero = wp.uint8(0) - uint_one = wp.uint8(1) - cl_pt = wp.vec3(0.0) - local_pt = wp.vec3(0.0) - in_sphere = pt[b_idx * horizon * nspheres + (h_idx * nspheres) + sph_idx] - in_rad = in_sphere[3] - if in_rad < 0.0: - distance[tid] = 0.0 - if write_grad == 1 and sparsity_idx[tid] == uint_one: - sparsity_idx[tid] = uint_zero - closest_pt[tid * 4] = 0.0 - closest_pt[tid * 4 + 1] = 0.0 - closest_pt[tid * 4 + 2] = 0.0 - - return - eta = activation_distance[0] - dt = speed_dt[0] - in_rad += eta - - max_dist_buffer = float(0.0) - max_dist_buffer = max_dist - if in_rad > max_dist_buffer: - max_dist_buffer += in_rad - in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) - - # read in sphere 0: - # read in sphere 0: - if h_idx > 0: - in_sphere = pt[b_idx * horizon * nspheres + ((h_idx - 1) * nspheres) + sph_idx] - sphere_0 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) - sphere_0_distance = wp.length(sphere_0 - in_pt) / 2.0 - - if h_idx < horizon - 1: - in_sphere = pt[b_idx * horizon * nspheres + ((h_idx + 1) * nspheres) + sph_idx] - sphere_2 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) - sphere_2_distance = wp.length(sphere_2 - in_pt) / 2.0 - - # read in sphere 2: - closest_distance = float(0.0) - closest_point = wp.vec3(0.0) - i = int(0) - dis_length = float(0.0) - jump_distance = float(0.0) - mid_distance = float(0.0) - n_mesh = n_env_mesh[0] - obj_position = wp.vec3() - - while i < n_mesh: - if mesh_enable[i] == uint_one: - obj_position[0] = mesh_pose[i * 8 + 0] - obj_position[1] = mesh_pose[i * 8 + 1] - obj_position[2] = mesh_pose[i * 8 + 2] - obj_quat = wp.quaternion( - mesh_pose[i * 8 + 4], - mesh_pose[i * 8 + 5], - mesh_pose[i * 8 + 6], - mesh_pose[i * 8 + 3], - ) - - obj_w_pose = wp.transform(obj_position, obj_quat) - obj_w_pose_t = wp.transform_inverse(obj_w_pose) - # transform point to mesh frame: - # mesh_pt = T_inverse @ w_pt - local_pt = wp.transform_point(obj_w_pose, in_pt) - - if wp.mesh_query_point( - mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v - ): - cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - local_pt) - dist = (-1.0 * dis_length * sign) + in_rad - if dist > 0: - cl_pt = sign * (cl_pt - local_pt) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) - if dist > eta: - dist_metric = dist - 0.5 * eta - elif dist <= eta: - dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec - - 
closest_distance += dist_metric - closest_point += grad_vec - else: - dist = -1.0 * dist - else: - dist = in_rad - dist = max(dist - in_rad, in_rad) - - mid_distance = dist - # transform sphere -1 - if h_idx > 0 and mid_distance < sphere_0_distance: - jump_distance = mid_distance - j = int(0) - sphere_temp = wp.transform_point(obj_w_pose, sphere_0) - while j < sweep_steps: - k0 = ( - 1.0 - 0.5 * jump_distance / sphere_0_distance - ) # dist could be greater than sphere_0_distance here? - sphere_int = k0 * local_pt + ((1.0 - k0) * sphere_temp) - if wp.mesh_query_point( - mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v - ): - cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - sphere_int) - dist = (-1.0 * dis_length * sign) + in_rad - if dist > 0: - cl_pt = sign * (cl_pt - sphere_int) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) - if dist > eta: - dist_metric = dist - 0.5 * eta - elif dist <= eta: - dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec - - closest_distance += dist_metric - closest_point += grad_vec - dist = max(dist - in_rad, in_rad) - jump_distance += dist - else: - dist = max(-dist - in_rad, in_rad) - - jump_distance += dist - else: - jump_distance += max_dist_buffer - j += 1 - if jump_distance >= sphere_0_distance: - j = int(sweep_steps) - # transform sphere -1 - if h_idx < horizon - 1 and mid_distance < sphere_2_distance: - jump_distance = mid_distance - j = int(0) - sphere_temp = wp.transform_point(obj_w_pose, sphere_2) - while j < sweep_steps: - k0 = ( - 1.0 - 0.5 * jump_distance / sphere_2_distance - ) # dist could be greater than sphere_0_distance here? - sphere_int = k0 * local_pt + (1.0 - k0) * sphere_temp - - if wp.mesh_query_point( - mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v - ): - cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - sphere_int) - dist = (-1.0 * dis_length * sign) + in_rad - if dist > 0: - cl_pt = sign * (cl_pt - sphere_int) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) - if dist > eta: - dist_metric = dist - 0.5 * eta - elif dist <= eta: - dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec - - closest_distance += dist_metric - closest_point += grad_vec - dist = max(dist - in_rad, in_rad) - - jump_distance += dist - else: - dist = max(-dist - in_rad, in_rad) - - jump_distance += dist - else: - jump_distance += max_dist_buffer - - j += 1 - if jump_distance >= sphere_2_distance: - j = int(sweep_steps) - - i += 1 - # return - if closest_distance == 0: - if sparsity_idx[tid] == uint_zero: - return - sparsity_idx[tid] = uint_zero - distance[tid] = 0.0 - if write_grad == 1: - closest_pt[tid * 4 + 0] = 0.0 - closest_pt[tid * 4 + 1] = 0.0 - closest_pt[tid * 4 + 2] = 0.0 - return - - if enable_speed_metric == 1 and (h_idx > 0 and h_idx < horizon - 1): - # calculate sphere velocity and acceleration: - norm_vel_vec = wp.vec3(0.0) - sph_acc_vec = wp.vec3(0.0) - sph_vel = wp.float(0.0) - - # use central difference - norm_vel_vec = (0.5 / dt) * (sphere_2 - sphere_0) - sph_acc_vec = (1.0 / (dt * dt)) * (sphere_0 + sphere_2 - 2.0 * in_pt) - # norm_vel_vec = -1.0 * norm_vel_vec - # sph_acc_vec = -1.0 * sph_acc_vec - sph_vel = wp.length(norm_vel_vec) - - norm_vel_vec = norm_vel_vec / sph_vel - - orth_proj = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) - wp.outer( - norm_vel_vec, norm_vel_vec - ) - - 
curvature_vec = orth_proj * (sph_acc_vec / (sph_vel * sph_vel)) - - closest_point = sph_vel * ((orth_proj * closest_point) - closest_distance * curvature_vec) - - closest_distance = sph_vel * closest_distance - - distance[tid] = weight[0] * closest_distance - sparsity_idx[tid] = uint_one - if write_grad == 1: - # compute gradient: - if closest_distance > 0.0: - closest_distance = weight[0] - closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] - closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] - closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] - - @wp.kernel def get_swept_closest_pt_batch_env( pt: wp.array(dtype=wp.vec4), @@ -307,7 +29,7 @@ def get_swept_closest_pt_batch_env( mesh_pose: wp.array(dtype=wp.float32), mesh_enable: wp.array(dtype=wp.uint8), n_env_mesh: wp.array(dtype=wp.int32), - max_dist: wp.float32, + max_dist: wp.array(dtype=wp.float32), write_grad: wp.uint8, batch_size: wp.int32, horizon: wp.int32, @@ -316,6 +38,7 @@ def get_swept_closest_pt_batch_env( sweep_steps: wp.uint8, enable_speed_metric: wp.uint8, env_query_idx: wp.array(dtype=wp.int32), + use_batch_env: wp.uint8, ): # we launch nspheres kernels # compute gradient here and return @@ -357,6 +80,7 @@ def get_swept_closest_pt_batch_env( sign = float(0.0) dist = float(0.0) dist_metric = float(0.0) + euclidean_distance = float(0.0) cl_pt = wp.vec3(0.0) local_pt = wp.vec3(0.0) in_sphere = pt[b_idx * horizon * nspheres + (h_idx * nspheres) + sph_idx] @@ -374,7 +98,7 @@ def get_swept_closest_pt_batch_env( eta = activation_distance[0] in_rad += eta max_dist_buffer = float(0.0) - max_dist_buffer = max_dist + max_dist_buffer = max_dist[0] if (in_rad) > max_dist_buffer: max_dist_buffer += in_rad @@ -396,7 +120,8 @@ def get_swept_closest_pt_batch_env( dis_length = float(0.0) jump_distance = float(0.0) mid_distance = float(0.0) - env_idx = env_query_idx[b_idx] + if use_batch_env: + env_idx = env_query_idx[b_idx] i = max_nmesh * env_idx n_mesh = i + n_env_mesh[env_idx] obj_position = wp.vec3() @@ -423,26 +148,33 @@ def get_swept_closest_pt_batch_env( mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v ): cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - local_pt) + delta = cl_pt - local_pt + dis_length = wp.length(delta) dist = (-1.0 * dis_length * sign) + in_rad if dist > 0: - cl_pt = sign * (cl_pt - local_pt) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist == in_rad: + cl_pt = sign * (delta) / (dist) + else: + cl_pt = sign * (delta) / dis_length + euclidean_distance = dist if dist > eta: dist_metric = dist - 0.5 * eta elif dist <= eta: dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec + cl_pt = (1.0 / eta) * dist * cl_pt + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) closest_distance += dist_metric closest_point += grad_vec else: dist = -1.0 * dist + euclidean_distance = dist else: dist = max_dist_buffer - dist = max(dist - in_rad, in_rad) + euclidean_distance = dist + dist = max(euclidean_distance - in_rad, in_rad) - mid_distance = dist + mid_distance = euclidean_distance # transform sphere -1 if h_idx > 0 and mid_distance < sphere_0_distance: jump_distance = mid_distance @@ -457,24 +189,31 @@ def get_swept_closest_pt_batch_env( mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v ): cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - sphere_int) + delta = cl_pt - sphere_int + dis_length = 
wp.length(delta) dist = (-1.0 * dis_length * sign) + in_rad if dist > 0: - cl_pt = sign * (cl_pt - sphere_int) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist == in_rad: + cl_pt = sign * (delta) / (dist) + else: + cl_pt = sign * (delta) / dis_length + euclidean_distance = dist if dist > eta: dist_metric = dist - 0.5 * eta elif dist <= eta: dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec + cl_pt = (1.0 / eta) * dist * cl_pt closest_distance += dist_metric + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + closest_point += grad_vec - dist = max(dist - in_rad, in_rad) - jump_distance += dist + dist = max(euclidean_distance - in_rad, in_rad) + jump_distance += euclidean_distance else: dist = max(-dist - in_rad, in_rad) - jump_distance += dist + euclidean_distance = dist + jump_distance += euclidean_distance else: jump_distance += max_dist_buffer j += 1 @@ -495,24 +234,30 @@ def get_swept_closest_pt_batch_env( mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v ): cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - sphere_int) + delta = cl_pt - sphere_int + dis_length = wp.length(delta) dist = (-1.0 * dis_length * sign) + in_rad if dist > 0: - cl_pt = sign * (cl_pt - sphere_int) / dis_length - grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + euclidean_distance = dist + if dist == in_rad: + cl_pt = sign * (delta) / (dist) + else: + cl_pt = sign * (delta) / dis_length + # cl_pt = sign * (delta) / dis_length if dist > eta: dist_metric = dist - 0.5 * eta elif dist <= eta: dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec + cl_pt = (1.0 / eta) * dist * cl_pt closest_distance += dist_metric + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + closest_point += grad_vec - dist = max(dist - in_rad, in_rad) + dist = max(euclidean_distance - in_rad, in_rad) jump_distance += dist else: dist = max(-dist - in_rad, in_rad) - jump_distance += dist else: jump_distance += max_dist_buffer @@ -542,179 +287,54 @@ def get_swept_closest_pt_batch_env( # use central difference norm_vel_vec = (0.5 / dt) * (sphere_2 - sphere_0) - sph_acc_vec = (1.0 / (dt * dt)) * (sphere_0 + sphere_2 - 2.0 * in_pt) - # norm_vel_vec = -1.0 * norm_vel_vec - # sph_acc_vec = -1.0 * sph_acc_vec sph_vel = wp.length(norm_vel_vec) + if sph_vel > 1e-3: + sph_acc_vec = (1.0 / (dt * dt)) * (sphere_0 + sphere_2 - 2.0 * in_pt) - norm_vel_vec = norm_vel_vec / sph_vel + norm_vel_vec = norm_vel_vec * (1.0 / sph_vel) - orth_proj = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) - wp.outer( - norm_vel_vec, norm_vel_vec - ) + curvature_vec = sph_acc_vec / (sph_vel * sph_vel) - curvature_vec = orth_proj * (sph_acc_vec / (sph_vel * sph_vel)) + orth_proj = wp.mat33(0.0) + for i in range(3): + for j in range(3): + orth_proj[i, j] = -1.0 * norm_vel_vec[i] * norm_vel_vec[j] - closest_point = sph_vel * ((orth_proj * closest_point) - closest_distance * curvature_vec) + orth_proj[0, 0] = orth_proj[0, 0] + 1.0 + orth_proj[1, 1] = orth_proj[1, 1] + 1.0 + orth_proj[2, 2] = orth_proj[2, 2] + 1.0 - closest_distance = sph_vel * closest_distance + orth_curv = wp.vec3( + 0.0, 0.0, 0.0 + ) # closest_distance * (orth_proj @ curvature_vec) #wp.matmul(orth_proj, curvature_vec) + orth_pt = wp.vec3(0.0, 0.0, 0.0) # orth_proj @ closest_point + for i in range(3): + orth_pt[i] = ( + orth_proj[i, 0] * closest_point[0] + + orth_proj[i, 1] * closest_point[1] + + orth_proj[i, 2] * closest_point[2] + ) 
+ + orth_curv[i] = closest_distance * ( + orth_proj[i, 0] * curvature_vec[0] + + orth_proj[i, 1] * curvature_vec[1] + + orth_proj[i, 2] * curvature_vec[2] + ) + + closest_point = sph_vel * (orth_pt - orth_curv) + + closest_distance = sph_vel * closest_distance distance[tid] = weight[0] * closest_distance sparsity_idx[tid] = uint_one if write_grad == 1: # compute gradient: - if closest_distance > 0.0: - closest_distance = weight[0] + closest_distance = weight[0] closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] -@wp.kernel -def get_closest_pt( - pt: wp.array(dtype=wp.vec4), - distance: wp.array(dtype=wp.float32), # this stores the output cost - closest_pt: wp.array(dtype=wp.float32), # this stores the gradient - sparsity_idx: wp.array(dtype=wp.uint8), - weight: wp.array(dtype=wp.float32), - activation_distance: wp.array(dtype=wp.float32), # eta threshold - mesh: wp.array(dtype=wp.uint64), - mesh_pose: wp.array(dtype=wp.float32), - mesh_enable: wp.array(dtype=wp.uint8), - n_env_mesh: wp.array(dtype=wp.int32), - max_dist: wp.float32, - write_grad: wp.uint8, - batch_size: wp.int32, - horizon: wp.int32, - nspheres: wp.int32, - max_nmesh: wp.int32, -): - # we launch nspheres kernels - # compute gradient here and return - # distance is negative outside and positive inside - tid = wp.tid() - n_mesh = int(0) - b_idx = int(0) - h_idx = int(0) - sph_idx = int(0) - - # env_idx = int(0) - - b_idx = tid / (horizon * nspheres) - - h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres - sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) - if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: - return - - face_index = int(0) - face_u = float(0.0) - face_v = float(0.0) - sign = float(0.0) - dist = float(0.0) - grad_vec = wp.vec3(0.0) - eta = float(0.05) - dist_metric = float(0.0) - - cl_pt = wp.vec3(0.0) - local_pt = wp.vec3(0.0) - in_sphere = pt[tid] - in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) - in_rad = in_sphere[3] - uint_zero = wp.uint8(0) - uint_one = wp.uint8(1) - if in_rad < 0.0: - distance[tid] = 0.0 - if write_grad == 1 and sparsity_idx[tid] == uint_one: - sparsity_idx[tid] = uint_zero - closest_pt[tid * 4] = 0.0 - closest_pt[tid * 4 + 1] = 0.0 - closest_pt[tid * 4 + 2] = 0.0 - return - - eta = activation_distance[0] - in_rad += eta - max_dist_buffer = float(0.0) - max_dist_buffer = max_dist - if in_rad > max_dist_buffer: - max_dist_buffer += in_rad - - # TODO: read vec4 and use first 3 for sphere position and last one for radius - # in_pt = pt[tid] - closest_distance = float(0.0) - closest_point = wp.vec3(0.0) - i = int(0) - dis_length = float(0.0) - # read env index: - # env_idx = env_query_idx[b_idx] - # read number of boxes in current environment: - - # get start index - i = int(0) - n_mesh = n_env_mesh[0] - obj_position = wp.vec3() - - # mesh_idx = wp.uint64(0) - while i < n_mesh: - if mesh_enable[i] == uint_one: - # transform point to mesh frame: - # mesh_pt = T_inverse @ w_pt - - # read object pose: - obj_position[0] = mesh_pose[i * 8 + 0] - obj_position[1] = mesh_pose[i * 8 + 1] - obj_position[2] = mesh_pose[i * 8 + 2] - obj_quat = wp.quaternion( - mesh_pose[i * 8 + 4], - mesh_pose[i * 8 + 5], - mesh_pose[i * 8 + 6], - mesh_pose[i * 8 + 3], - ) - - obj_w_pose = wp.transform(obj_position, obj_quat) - - local_pt = wp.transform_point(obj_w_pose, in_pt) - # mesh_idx = mesh[i] - if wp.mesh_query_point( - mesh[i], 
local_pt, max_dist_buffer, sign, face_index, face_u, face_v - ): - cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - local_pt) - dist = (-1.0 * dis_length * sign) + in_rad - if dist > 0: - cl_pt = sign * (cl_pt - local_pt) / dis_length - grad_vec = wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) - if dist > eta: - dist_metric = dist - 0.5 * eta - elif dist <= eta: - dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec - closest_distance += dist_metric - closest_point += grad_vec - - i += 1 - - if closest_distance == 0: - if sparsity_idx[tid] == uint_zero: - return - sparsity_idx[tid] = uint_zero - distance[tid] = 0.0 - if write_grad == 1: - closest_pt[tid * 4 + 0] = 0.0 - closest_pt[tid * 4 + 1] = 0.0 - closest_pt[tid * 4 + 2] = 0.0 - else: - distance[tid] = weight[0] * closest_distance - sparsity_idx[tid] = uint_one - if write_grad == 1: - # compute gradient: - if closest_distance > 0.0: - closest_distance = weight[0] - closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] - closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] - closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] - - @wp.kernel def get_closest_pt_batch_env( pt: wp.array(dtype=wp.vec4), @@ -727,13 +347,15 @@ def get_closest_pt_batch_env( mesh_pose: wp.array(dtype=wp.float32), mesh_enable: wp.array(dtype=wp.uint8), n_env_mesh: wp.array(dtype=wp.int32), - max_dist: wp.float32, + max_dist: wp.array(dtype=wp.float32), write_grad: wp.uint8, batch_size: wp.int32, horizon: wp.int32, nspheres: wp.int32, max_nmesh: wp.int32, env_query_idx: wp.array(dtype=wp.int32), + use_batch_env: wp.uint8, + compute_esdf: wp.uint8, ): # we launch nspheres kernels # compute gradient here and return @@ -779,8 +401,9 @@ def get_closest_pt_batch_env( return eta = activation_distance[0] - in_rad += eta - max_dist_buffer = max_dist + if compute_esdf != 1: + in_rad += eta + max_dist_buffer = max_dist[0] if (in_rad) > max_dist_buffer: max_dist_buffer += in_rad @@ -791,7 +414,9 @@ def get_closest_pt_batch_env( dis_length = float(0.0) # read env index: - env_idx = env_query_idx[b_idx] + if use_batch_env: + env_idx = env_query_idx[b_idx] + # read number of boxes in current environment: # get start index @@ -799,7 +424,9 @@ def get_closest_pt_batch_env( i = max_nmesh * env_idx n_mesh = i + n_env_mesh[env_idx] obj_position = wp.vec3() - + max_dist_value = -1.0 * max_dist_buffer + if compute_esdf == 1: + closest_distance = max_dist_value while i < n_mesh: if mesh_enable[i] == uint_one: # transform point to mesh frame: @@ -822,21 +449,39 @@ def get_closest_pt_batch_env( mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v ): cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) - dis_length = wp.length(cl_pt - local_pt) - dist = (-1.0 * dis_length * sign) + in_rad - if dist > 0: - cl_pt = sign * (cl_pt - local_pt) / dis_length - grad_vec = wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) - if dist > eta: - dist_metric = dist - 0.5 * eta - elif dist <= eta: - dist_metric = (0.5 / eta) * (dist) * dist - grad_vec = (1.0 / eta) * dist * grad_vec - closest_distance += dist_metric - closest_point += grad_vec + delta = cl_pt - local_pt + dis_length = wp.length(delta) + dist = -1.0 * dis_length * sign + + if compute_esdf == 1: + if dist > max_dist_value: + max_dist_value = dist + closest_distance = dist + if write_grad == 1: + cl_pt = sign * (delta) / dis_length + grad_vec = 
wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) + closest_point = grad_vec + + else: + dist = dist + in_rad + + if dist > 0: + if dist == in_rad: + cl_pt = sign * (delta) / (dist) + else: + cl_pt = sign * (delta) / dis_length + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + cl_pt = (1.0 / eta) * dist * cl_pt + grad_vec = wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) + + closest_distance += dist_metric + closest_point += grad_vec i += 1 - if closest_distance == 0: + if closest_distance == 0 and compute_esdf != 1: if sparsity_idx[tid] == uint_zero: return sparsity_idx[tid] = uint_zero @@ -850,8 +495,7 @@ def get_closest_pt_batch_env( sparsity_idx[tid] = uint_one if write_grad == 1: # compute gradient: - if closest_distance > 0.0: - closest_distance = weight[0] + closest_distance = weight[0] closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] @@ -871,62 +515,43 @@ class SdfMeshWarpPy(torch.autograd.Function): mesh_pose_inverse, mesh_enable, n_env_mesh, - max_dist=0.05, + max_dist, env_query_idx=None, return_loss=False, + compute_esdf=False, ): b, h, n, _ = query_spheres.shape - + use_batch_env = True if env_query_idx is None: - # launch - wp.launch( - kernel=get_closest_pt, - dim=b * h * n, - inputs=[ - wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), - wp.from_torch(out_cost.view(-1)), - wp.from_torch(out_grad.view(-1), dtype=wp.float32), - wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), - wp.from_torch(weight), - wp.from_torch(activation_distance), - wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), - wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), - wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), - wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), - max_dist, - query_spheres.requires_grad, - b, - h, - n, - mesh_idx.shape[1], - ], - stream=wp.stream_from_torch(query_spheres.device), - ) - else: - wp.launch( - kernel=get_closest_pt_batch_env, - dim=b * h * n, - inputs=[ - wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), - wp.from_torch(out_cost.view(-1)), - wp.from_torch(out_grad.view(-1), dtype=wp.float32), - wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), - wp.from_torch(weight), - wp.from_torch(activation_distance), - wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), - wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), - wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), - wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), - max_dist, - query_spheres.requires_grad, - b, - h, - n, - mesh_idx.shape[1], - wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), - ], - stream=wp.stream_from_torch(query_spheres.device), - ) + use_batch_env = False + env_query_idx = n_env_mesh + + wp.launch( + kernel=get_closest_pt_batch_env, + dim=b * h * n, + inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + wp.from_torch(max_dist, dtype=wp.float32), + 
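# The remaining launch inputs follow the updated get_closest_pt_batch_env
+ # signature above: max_dist is now passed as a wp.array rather than a scalar,
+ # and when env_query_idx was None, n_env_mesh is passed as a placeholder with
+ # use_batch_env=False so the kernel skips the per-batch environment lookup.
+ 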
query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), + use_batch_env, + compute_esdf, + ], + stream=wp.stream_from_torch(query_spheres.device), + ) ctx.return_loss = return_loss ctx.save_for_backward(out_grad) return out_cost @@ -939,7 +564,22 @@ class SdfMeshWarpPy(torch.autograd.Function): grad_sph = r if ctx.return_loss: grad_sph = r * grad_output.unsqueeze(-1) - return grad_sph, None, None, None, None, None, None, None, None, None, None, None, None + return ( + grad_sph, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) class SweptSdfMeshWarpPy(torch.autograd.Function): @@ -957,69 +597,46 @@ class SweptSdfMeshWarpPy(torch.autograd.Function): mesh_pose_inverse, mesh_enable, n_env_mesh, + max_dist, sweep_steps=1, enable_speed_metric=False, - max_dist=0.05, env_query_idx=None, return_loss=False, ): b, h, n, _ = query_spheres.shape - + use_batch_env = True if env_query_idx is None: - wp.launch( - kernel=get_swept_closest_pt, - dim=b * h * n, - inputs=[ - wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), - wp.from_torch(out_cost.view(-1)), - wp.from_torch(out_grad.view(-1), dtype=wp.float32), - wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), - wp.from_torch(weight), - wp.from_torch(activation_distance), - wp.from_torch(speed_dt), - wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), - wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), - wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), - wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), - max_dist, - query_spheres.requires_grad, - b, - h, - n, - mesh_idx.shape[1], - sweep_steps, - enable_speed_metric, - ], - stream=wp.stream_from_torch(query_spheres.device), - ) - else: - wp.launch( - kernel=get_swept_closest_pt_batch_env, - dim=b * h * n, - inputs=[ - wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), - wp.from_torch(out_cost.view(-1)), - wp.from_torch(out_grad.view(-1), dtype=wp.float32), - wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), - wp.from_torch(weight), - wp.from_torch(activation_distance), - wp.from_torch(speed_dt), - wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), - wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), - wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), - wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), - max_dist, - query_spheres.requires_grad, - b, - h, - n, - mesh_idx.shape[1], - sweep_steps, - enable_speed_metric, - wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), - ], - stream=wp.stream_from_torch(query_spheres.device), - ) + use_batch_env = False + env_query_idx = n_env_mesh + + wp.launch( + kernel=get_swept_closest_pt_batch_env, + dim=b * h * n, + inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(speed_dt), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + wp.from_torch(max_dist, dtype=wp.float32), + query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + sweep_steps, + enable_speed_metric, + wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), + use_batch_env, + ], + 
stream=wp.stream_from_torch(query_spheres.device), + ) ctx.return_loss = return_loss ctx.save_for_backward(out_grad) return out_cost diff --git a/src/curobo/geom/sdf/world.py b/src/curobo/geom/sdf/world.py index ffa4981..ae1ec18 100644 --- a/src/curobo/geom/sdf/world.py +++ b/src/curobo/geom/sdf/world.py @@ -19,7 +19,7 @@ import torch # CuRobo from curobo.curobolib.geom import SdfSphereOBB, SdfSweptSphereOBB -from curobo.geom.types import Cuboid, Mesh, Obstacle, WorldConfig, batch_tensor_cube +from curobo.geom.types import Cuboid, Mesh, Obstacle, VoxelGrid, WorldConfig, batch_tensor_cube from curobo.types.base import TensorDeviceType from curobo.types.math import Pose from curobo.util.logger import log_error, log_info, log_warn @@ -39,10 +39,14 @@ class CollisionBuffer: def initialize_from_shape(cls, shape: torch.Size, tensor_args: TensorDeviceType): batch, horizon, n_spheres, _ = shape distance_buffer = torch.zeros( - (batch, horizon, n_spheres), device=tensor_args.device, dtype=tensor_args.dtype + (batch, horizon, n_spheres), + device=tensor_args.device, + dtype=tensor_args.collision_distance_dtype, ) grad_distance_buffer = torch.zeros( - (batch, horizon, n_spheres, 4), device=tensor_args.device, dtype=tensor_args.dtype + (batch, horizon, n_spheres, 4), + device=tensor_args.device, + dtype=tensor_args.collision_gradient_dtype, ) sparsity_idx = torch.zeros( (batch, horizon, n_spheres), @@ -54,10 +58,14 @@ class CollisionBuffer: def _update_from_shape(self, shape: torch.Size, tensor_args: TensorDeviceType): batch, horizon, n_spheres, _ = shape self.distance_buffer = torch.zeros( - (batch, horizon, n_spheres), device=tensor_args.device, dtype=tensor_args.dtype + (batch, horizon, n_spheres), + device=tensor_args.device, + dtype=tensor_args.collision_distance_dtype, ) self.grad_distance_buffer = torch.zeros( - (batch, horizon, n_spheres, 4), device=tensor_args.device, dtype=tensor_args.dtype + (batch, horizon, n_spheres, 4), + device=tensor_args.device, + dtype=tensor_args.collision_gradient_dtype, ) self.sparsity_index_buffer = torch.zeros( (batch, horizon, n_spheres), @@ -100,6 +108,7 @@ class CollisionQueryBuffer: primitive_collision_buffer: Optional[CollisionBuffer] = None mesh_collision_buffer: Optional[CollisionBuffer] = None blox_collision_buffer: Optional[CollisionBuffer] = None + voxel_collision_buffer: Optional[CollisionBuffer] = None shape: Optional[torch.Size] = None def __post_init__(self): @@ -110,6 +119,8 @@ class CollisionQueryBuffer: self.shape = self.mesh_collision_buffer.shape elif self.blox_collision_buffer is not None: self.shape = self.blox_collision_buffer.shape + elif self.voxel_collision_buffer is not None: + self.shape = self.voxel_collision_buffer.shape def __mul__(self, scalar: float): if self.primitive_collision_buffer is not None: @@ -118,17 +129,27 @@ class CollisionQueryBuffer: self.mesh_collision_buffer = self.mesh_collision_buffer * scalar if self.blox_collision_buffer is not None: self.blox_collision_buffer = self.blox_collision_buffer * scalar + if self.voxel_collision_buffer is not None: + self.voxel_collision_buffer = self.voxel_collision_buffer * scalar return self def clone(self): - prim_buffer = mesh_buffer = blox_buffer = None + prim_buffer = mesh_buffer = blox_buffer = voxel_buffer = None if self.primitive_collision_buffer is not None: prim_buffer = self.primitive_collision_buffer.clone() if self.mesh_collision_buffer is not None: mesh_buffer = self.mesh_collision_buffer.clone() if self.blox_collision_buffer is not None: blox_buffer = 
self.blox_collision_buffer.clone() - return CollisionQueryBuffer(prim_buffer, mesh_buffer, blox_buffer, self.shape) + if self.voxel_collision_buffer is not None: + voxel_buffer = self.voxel_collision_buffer.clone() + return CollisionQueryBuffer( + prim_buffer, + mesh_buffer, + blox_buffer, + voxel_collision_buffer=voxel_buffer, + shape=self.shape, + ) @classmethod def initialize_from_shape( @@ -137,14 +158,18 @@ class CollisionQueryBuffer: tensor_args: TensorDeviceType, collision_types: Dict[str, bool], ): - primitive_buffer = mesh_buffer = blox_buffer = None + primitive_buffer = mesh_buffer = blox_buffer = voxel_buffer = None if "primitive" in collision_types and collision_types["primitive"]: primitive_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) if "mesh" in collision_types and collision_types["mesh"]: mesh_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) if "blox" in collision_types and collision_types["blox"]: blox_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) - return CollisionQueryBuffer(primitive_buffer, mesh_buffer, blox_buffer) + if "voxel" in collision_types and collision_types["voxel"]: + voxel_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + return CollisionQueryBuffer( + primitive_buffer, mesh_buffer, blox_buffer, voxel_collision_buffer=voxel_buffer + ) def create_from_shape( self, @@ -160,8 +185,9 @@ class CollisionQueryBuffer: self.mesh_collision_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) if "blox" in collision_types and collision_types["blox"]: self.blox_collision_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + if "voxel" in collision_types and collision_types["voxel"]: + self.voxel_collision_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) self.shape = shape - # return self def update_buffer_shape( self, @@ -169,12 +195,10 @@ class CollisionQueryBuffer: tensor_args: TensorDeviceType, collision_types: Optional[Dict[str, bool]], ): - # print(shape, self.shape) # update buffers: assert len(shape) == 4 # shape is: batch, horizon, n_spheres, 4 if self.shape is None: # buffers not initialized: self.create_from_shape(shape, tensor_args, collision_types) - # print("Creating new memory", self.shape) else: # update buffers if shape doesn't match: # TODO: allow for dynamic change of collision_types @@ -185,6 +209,8 @@ class CollisionQueryBuffer: self.mesh_collision_buffer.update_buffer_shape(shape, tensor_args) if self.blox_collision_buffer is not None: self.blox_collision_buffer.update_buffer_shape(shape, tensor_args) + if self.voxel_collision_buffer is not None: + self.voxel_collision_buffer.update_buffer_shape(shape, tensor_args) self.shape = shape def get_gradient_buffer( @@ -208,6 +234,12 @@ class CollisionQueryBuffer: current_buffer = blox_buffer.clone() else: current_buffer += blox_buffer + if self.voxel_collision_buffer is not None: + voxel_buffer = self.voxel_collision_buffer.grad_distance_buffer + if current_buffer is None: + current_buffer = voxel_buffer.clone() + else: + current_buffer += voxel_buffer return current_buffer @@ -221,6 +253,7 @@ class CollisionCheckerType(Enum): PRIMITIVE = "PRIMITIVE" BLOX = "BLOX" MESH = "MESH" + VOXEL = "VOXEL" @dataclass @@ -230,11 +263,13 @@ class WorldCollisionConfig: cache: Optional[Dict[Obstacle, int]] = None n_envs: int = 1 checker_type: CollisionCheckerType = CollisionCheckerType.PRIMITIVE - max_distance: float = 0.01 + max_distance: Union[torch.Tensor, float] = 0.01 def 
__post_init__(self): if self.world_model is not None and isinstance(self.world_model, list): self.n_envs = len(self.world_model) + if isinstance(self.max_distance, float): + self.max_distance = self.tensor_args.to_device([self.max_distance]) @staticmethod def load_from_dict( @@ -261,6 +296,8 @@ class WorldCollision(WorldCollisionConfig): if config is not None: WorldCollisionConfig.__init__(self, **vars(config)) self.collision_types = {} # Use this dictionary to store collision types + self._cache_voxelization = None + self._cache_voxelization_collision_buffer = None def load_collision_model(self, world_model: WorldConfig): raise NotImplementedError @@ -273,6 +310,8 @@ class WorldCollision(WorldCollisionConfig): activation_distance: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, + compute_esdf: bool = False, ): """ Computes the signed distance via analytic function @@ -310,6 +349,7 @@ class WorldCollision(WorldCollisionConfig): enable_speed_metric=False, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, ): raise NotImplementedError @@ -338,6 +378,118 @@ class WorldCollision(WorldCollisionConfig): ): raise NotImplementedError + def get_voxels_in_bounding_box( + self, + cuboid: Cuboid = Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), + voxel_size: float = 0.02, + ) -> Union[List[Cuboid], torch.Tensor]: + new_grid = self.get_occupancy_in_bounding_box(cuboid, voxel_size) + occupied = new_grid.get_occupied_voxels(0.0) + return occupied + + def clear_voxelization_cache(self): + self._cache_voxelization = None + + def update_cache_voxelization(self, new_grid: VoxelGrid): + if ( + self._cache_voxelization is None + or self._cache_voxelization.voxel_size != new_grid.voxel_size + or self._cache_voxelization.dims != new_grid.dims + ): + self._cache_voxelization = new_grid + self._cache_voxelization.xyzr_tensor = self._cache_voxelization.create_xyzr_tensor( + transform_to_origin=True, tensor_args=self.tensor_args + ) + self._cache_voxelization_collision_buffer = CollisionQueryBuffer() + xyzr = self._cache_voxelization.xyzr_tensor.view(-1, 1, 1, 4) + + self._cache_voxelization_collision_buffer.update_buffer_shape( + xyzr.shape, + self.tensor_args, + self.collision_types, + ) + + def get_occupancy_in_bounding_box( + self, + cuboid: Cuboid = Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), + voxel_size: float = 0.02, + ) -> VoxelGrid: + new_grid = VoxelGrid( + name=cuboid.name, dims=cuboid.dims, pose=cuboid.pose, voxel_size=voxel_size + ) + + self.update_cache_voxelization(new_grid) + + xyzr = self._cache_voxelization.xyzr_tensor + + xyzr = xyzr.view(-1, 1, 1, 4) + + weight = self.tensor_args.to_device([1.0]) + act_distance = self.tensor_args.to_device([0.0]) + + d_sph = self.get_sphere_collision( + xyzr, + self._cache_voxelization_collision_buffer, + weight, + act_distance, + ) + d_sph = d_sph.reshape(-1) + new_grid.xyzr_tensor = self._cache_voxelization.xyzr_tensor + new_grid.feature_tensor = d_sph + return new_grid + + def get_esdf_in_bounding_box( + self, + cuboid: Cuboid = Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), + voxel_size: float = 0.02, + dtype=torch.float32, + ) -> VoxelGrid: + new_grid = VoxelGrid( + name=cuboid.name, + dims=cuboid.dims, + pose=cuboid.pose, + voxel_size=voxel_size, + feature_dtype=dtype, + ) + + self.update_cache_voxelization(new_grid) + + xyzr = self._cache_voxelization.xyzr_tensor + voxel_shape = 
xyzr.shape + xyzr = xyzr.view(-1, 1, 1, 4) + + weight = self.tensor_args.to_device([1.0]) + + d_sph = self.get_sphere_distance( + xyzr, + self._cache_voxelization_collision_buffer, + weight, + self.max_distance, + sum_collisions=False, + compute_esdf=True, + ) + + d_sph = d_sph.reshape(-1) + voxel_grid = self._cache_voxelization + voxel_grid.feature_tensor = d_sph + + return voxel_grid + + def get_mesh_in_bounding_box( + self, + cuboid: Cuboid = Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), + voxel_size: float = 0.02, + ) -> Mesh: + voxels = self.get_voxels_in_bounding_box(cuboid, voxel_size) + # voxels = voxels.cpu().numpy() + # cuboids = [Cuboid(name="c_"+str(x), pose=[voxels[x,0],voxels[x,1],voxels[x,2], 1,0,0,0], dims=[voxel_size, voxel_size, voxel_size]) for x in range(voxels.shape[0])] + # mesh = WorldConfig(cuboid=cuboids).get_mesh_world(True).mesh[0] + mesh = Mesh.from_pointcloud( + voxels[:, :3].detach().cpu().numpy(), + pitch=voxel_size * 1.1, + ) + return mesh + class WorldPrimitiveCollision(WorldCollision): """World Oriented Bounding Box representation object @@ -354,6 +506,7 @@ class WorldPrimitiveCollision(WorldCollision): self._env_n_obbs = None self._env_obbs_names = None self._init_cache() + if self.world_model is not None: if isinstance(self.world_model, list): self.load_batch_collision_model(self.world_model) @@ -656,6 +809,8 @@ class WorldPrimitiveCollision(WorldCollision): activation_distance: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None, return_loss=False, + sum_collisions: bool = True, + compute_esdf: bool = False, ): if "primitive" not in self.collision_types or not self.collision_types["primitive"]: raise ValueError("Primitive Collision has no obstacles") @@ -673,6 +828,7 @@ class WorldPrimitiveCollision(WorldCollision): collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer, weight, activation_distance, + self.max_distance, self._cube_tensor_list[0], self._cube_tensor_list[0], self._cube_tensor_list[1], @@ -687,6 +843,8 @@ class WorldPrimitiveCollision(WorldCollision): True, use_batch_env, return_loss, + sum_collisions, + compute_esdf, ) return dist @@ -699,6 +857,7 @@ class WorldPrimitiveCollision(WorldCollision): activation_distance: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None, return_loss=False, + **kwargs, ): if "primitive" not in self.collision_types or not self.collision_types["primitive"]: raise ValueError("Primitive Collision has no obstacles") @@ -717,6 +876,7 @@ class WorldPrimitiveCollision(WorldCollision): collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer, weight, activation_distance, + self.max_distance, self._cube_tensor_list[0], self._cube_tensor_list[0], self._cube_tensor_list[1], @@ -728,7 +888,7 @@ class WorldPrimitiveCollision(WorldCollision): h, n, query_sphere.requires_grad, - False, + True, use_batch_env, return_loss, ) @@ -745,6 +905,7 @@ class WorldPrimitiveCollision(WorldCollision): enable_speed_metric=False, env_query_idx: Optional[torch.Tensor] = None, return_loss=False, + sum_collisions: bool = True, ): """ Computes the signed distance via analytic function @@ -784,6 +945,7 @@ class WorldPrimitiveCollision(WorldCollision): True, use_batch_env, return_loss, + sum_collisions, ) return dist @@ -836,7 +998,7 @@ class WorldPrimitiveCollision(WorldCollision): sweep_steps, enable_speed_metric, query_sphere.requires_grad, - False, + True, use_batch_env, return_loss, ) @@ -845,70 +1007,5 @@ class WorldPrimitiveCollision(WorldCollision): def clear_cache(self): 
if self._cube_tensor_list is not None: - self._cube_tensor_list[2][:] = 1 + self._cube_tensor_list[2][:] = 0 self._env_n_obbs[:] = 0 - - def get_voxels_in_bounding_box( - self, - cuboid: Cuboid, - voxel_size: float = 0.02, - ) -> Union[List[Cuboid], torch.Tensor]: - bounds = cuboid.dims - low = [-bounds[0], -bounds[1], -bounds[2]] - high = [bounds[0], bounds[1], bounds[2]] - trange = [h - l for l, h in zip(low, high)] - - x = torch.linspace( - -bounds[0], bounds[0], int(trange[0] // voxel_size) + 1, device=self.tensor_args.device - ) - y = torch.linspace( - -bounds[1], bounds[1], int(trange[1] // voxel_size) + 1, device=self.tensor_args.device - ) - z = torch.linspace( - -bounds[2], bounds[2], int(trange[2] // voxel_size) + 1, device=self.tensor_args.device - ) - w, l, h = x.shape[0], y.shape[0], z.shape[0] - xyz = ( - torch.stack(torch.meshgrid(x, y, z, indexing="ij")).permute((1, 2, 3, 0)).reshape(-1, 3) - ) - - pose = Pose.from_list(cuboid.pose, tensor_args=self.tensor_args) - xyz = pose.transform_points(xyz.contiguous()) - r = torch.zeros_like(xyz[:, 0:1]) + voxel_size - xyzr = torch.cat([xyz, r], dim=1) - xyzr = xyzr.reshape(-1, 1, 1, 4) - collision_buffer = CollisionQueryBuffer() - collision_buffer.update_buffer_shape( - xyzr.shape, - self.tensor_args, - self.collision_types, - ) - weight = self.tensor_args.to_device([1.0]) - act_distance = self.tensor_args.to_device([0.0]) - - d_sph = self.get_sphere_collision( - xyzr, - collision_buffer, - weight, - act_distance, - ) - d_sph = d_sph.reshape(-1) - xyzr = xyzr.reshape(-1, 4) - # get occupied voxels: - occupied = xyzr[d_sph > 0.0] - return occupied - - def get_mesh_in_bounding_box( - self, - cuboid: Cuboid, - voxel_size: float = 0.02, - ) -> Mesh: - voxels = self.get_voxels_in_bounding_box(cuboid, voxel_size) - # voxels = voxels.cpu().numpy() - # cuboids = [Cuboid(name="c_"+str(x), pose=[voxels[x,0],voxels[x,1],voxels[x,2], 1,0,0,0], dims=[voxel_size, voxel_size, voxel_size]) for x in range(voxels.shape[0])] - # mesh = WorldConfig(cuboid=cuboids).get_mesh_world(True).mesh[0] - mesh = Mesh.from_pointcloud( - voxels[:, :3].detach().cpu().numpy(), - pitch=voxel_size * 1.1, - ) - return mesh diff --git a/src/curobo/geom/sdf/world_blox.py b/src/curobo/geom/sdf/world_blox.py index e844637..4eb6576 100644 --- a/src/curobo/geom/sdf/world_blox.py +++ b/src/curobo/geom/sdf/world_blox.py @@ -176,6 +176,8 @@ class WorldBloxCollision(WorldMeshCollision): activation_distance: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, + compute_esdf: bool = False, ): if "blox" not in self.collision_types or not self.collision_types["blox"]: return super().get_sphere_distance( @@ -185,6 +187,8 @@ class WorldBloxCollision(WorldMeshCollision): activation_distance, env_query_idx, return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, ) d = self._get_blox_sdf( @@ -205,8 +209,13 @@ class WorldBloxCollision(WorldMeshCollision): activation_distance, env_query_idx, return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, ) - d = d + d_base + if compute_esdf: + d = torch.maximum(d, d_base) + else: + d = d + d_base return d @@ -262,6 +271,7 @@ class WorldBloxCollision(WorldMeshCollision): enable_speed_metric=False, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, ): if "blox" not in self.collision_types or not self.collision_types["blox"]: return super().get_swept_sphere_distance( @@ -274,6 +284,7 @@ class 
WorldBloxCollision(WorldMeshCollision): enable_speed_metric, env_query_idx, return_loss=return_loss, + sum_collisions=sum_collisions, ) d = self._get_blox_swept_sdf( @@ -301,6 +312,7 @@ class WorldBloxCollision(WorldMeshCollision): enable_speed_metric, env_query_idx, return_loss=return_loss, + sum_collisions=sum_collisions, ) d = d + d_base diff --git a/src/curobo/geom/sdf/world_mesh.py b/src/curobo/geom/sdf/world_mesh.py index 3bf32ad..0835bf4 100644 --- a/src/curobo/geom/sdf/world_mesh.py +++ b/src/curobo/geom/sdf/world_mesh.py @@ -89,6 +89,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): self._mesh_tensor_list[0][env_idx, :max_nmesh] = w_mid self._mesh_tensor_list[1][env_idx, :max_nmesh, :7] = w_inv_pose self._mesh_tensor_list[2][env_idx, :max_nmesh] = 1 + self._mesh_tensor_list[2][env_idx, max_nmesh:] = 0 self._env_mesh_names[env_idx][:max_nmesh] = name_list self._env_n_mesh[env_idx] = max_nmesh @@ -355,6 +356,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance: torch.Tensor, env_query_idx=None, return_loss=False, + compute_esdf=False, ): d = SdfMeshWarpPy.apply( query_spheres, @@ -370,6 +372,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): self.max_distance, env_query_idx, return_loss, + compute_esdf, ) return d @@ -397,9 +400,9 @@ class WorldMeshCollision(WorldPrimitiveCollision): self._mesh_tensor_list[1], self._mesh_tensor_list[2], self._env_n_mesh, + self.max_distance, sweep_steps, enable_speed_metric, - self.max_distance, env_query_idx, return_loss, ) @@ -413,6 +416,8 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, + compute_esdf: bool = False, ): # TODO: if no mesh object exist, call primitive if "mesh" not in self.collision_types or not self.collision_types["mesh"]: @@ -423,6 +428,8 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance=activation_distance, env_query_idx=env_query_idx, return_loss=return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, ) d = self._get_sdf( @@ -432,6 +439,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance=activation_distance, env_query_idx=env_query_idx, return_loss=return_loss, + compute_esdf=compute_esdf, ) if "primitive" not in self.collision_types or not self.collision_types["primitive"]: @@ -443,8 +451,13 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance=activation_distance, env_query_idx=env_query_idx, return_loss=return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, ) - d_val = d.view(d_prim.shape) + d_prim + if compute_esdf: + d_val = torch.maximum(d.view(d_prim.shape), d_prim) + else: + d_val = d.view(d_prim.shape) + d_prim return d_val def get_sphere_collision( @@ -455,6 +468,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): activation_distance: torch.Tensor, env_query_idx=None, return_loss=False, + **kwargs, ): if "mesh" not in self.collision_types or not self.collision_types["mesh"]: return super().get_sphere_collision( @@ -501,6 +515,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): enable_speed_metric=False, env_query_idx: Optional[torch.Tensor] = None, return_loss: bool = False, + sum_collisions: bool = True, ): # log_warn("Swept: Mesh + Primitive Collision Checking is experimental") if "mesh" not in self.collision_types or not self.collision_types["mesh"]: @@ -514,6 +529,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): 
speed_dt=speed_dt, enable_speed_metric=enable_speed_metric, return_loss=return_loss, + sum_collisions=sum_collisions, ) d = self._get_swept_sdf( @@ -540,6 +556,7 @@ class WorldMeshCollision(WorldPrimitiveCollision): speed_dt=speed_dt, enable_speed_metric=enable_speed_metric, return_loss=return_loss, + sum_collisions=sum_collisions, ) d_val = d.view(d_prim.shape) + d_prim @@ -602,4 +619,11 @@ class WorldMeshCollision(WorldPrimitiveCollision): self._wp_mesh_cache = {} if self._mesh_tensor_list is not None: self._mesh_tensor_list[2][:] = 0 + if self._env_n_mesh is not None: + self._env_n_mesh[:] = 0 + if self._env_mesh_names is not None: + self._env_mesh_names = [ + [None for _ in range(self.cache["mesh"])] for _ in range(self.n_envs) + ] + super().clear_cache() diff --git a/src/curobo/geom/sdf/world_voxel.py b/src/curobo/geom/sdf/world_voxel.py new file mode 100644 index 0000000..4fb6be4 --- /dev/null +++ b/src/curobo/geom/sdf/world_voxel.py @@ -0,0 +1,699 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +import math +from typing import Any, Dict, List, Optional + +# Third Party +import numpy as np +import torch + +# CuRobo +from curobo.curobolib.geom import SdfSphereVoxel, SdfSweptSphereVoxel +from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollisionConfig +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import VoxelGrid, WorldConfig +from curobo.types.math import Pose +from curobo.util.logger import log_error, log_info, log_warn + + +class WorldVoxelCollision(WorldMeshCollision): + """Voxel grid representation of World, with each voxel containing Euclidean Signed Distance.""" + + def __init__(self, config: WorldCollisionConfig): + self._env_n_voxels = None + self._voxel_tensor_list = None + self._env_voxel_names = None + + super().__init__(config) + + def _init_cache(self): + if ( + self.cache is not None + and "voxel" in self.cache + and self.cache["voxel"] not in [None, 0] + ): + self._create_voxel_cache(self.cache["voxel"]) + + def _create_voxel_cache(self, voxel_cache: Dict[str, Any]): + n_layers = voxel_cache["layers"] + dims = voxel_cache["dims"] + voxel_size = voxel_cache["voxel_size"] + feature_dtype = voxel_cache["feature_dtype"] + n_voxels = int( + math.floor(dims[0] / voxel_size) + * math.floor(dims[1] / voxel_size) + * math.floor(dims[2] / voxel_size) + ) + + voxel_params = torch.zeros( + (self.n_envs, n_layers, 4), + dtype=self.tensor_args.dtype, + device=self.tensor_args.device, + ) + voxel_pose = torch.zeros( + (self.n_envs, n_layers, 8), + dtype=self.tensor_args.dtype, + device=self.tensor_args.device, + ) + voxel_pose[..., 3] = 1.0 + voxel_enable = torch.zeros( + (self.n_envs, n_layers), dtype=torch.uint8, device=self.tensor_args.device + ) + self._env_n_voxels = torch.zeros( + (self.n_envs), device=self.tensor_args.device, dtype=torch.int32 + ) + voxel_features = torch.zeros( + (self.n_envs, n_layers, n_voxels, 1), + device=self.tensor_args.device, + dtype=feature_dtype, + ) + + self._voxel_tensor_list = [voxel_params, voxel_pose, voxel_enable, voxel_features] + 
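# Layout of the cached voxel buffers (indexed per environment, per grid layer):
+ #   voxel_params:   (n_envs, n_layers, 4) -> [dim_x, dim_y, dim_z, voxel_size]
+ #   voxel_pose:     (n_envs, n_layers, 8) -> inverse grid pose (position + quaternion), padded to 8
+ #   voxel_enable:   (n_envs, n_layers)    -> uint8 enable flag per grid
+ #   voxel_features: (n_envs, n_layers, n_voxels, 1) -> signed distance feature per voxel
+ 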
self.collision_types["voxel"] = True + self._env_voxel_names = [[None for _ in range(n_layers)] for _ in range(self.n_envs)] + + def load_collision_model( + self, world_model: WorldConfig, env_idx=0, fix_cache_reference: bool = False + ): + self._load_collision_model_in_cache( + world_model, env_idx, fix_cache_reference=fix_cache_reference + ) + return super().load_collision_model( + world_model, env_idx=env_idx, fix_cache_reference=fix_cache_reference + ) + + def _load_collision_model_in_cache( + self, world_config: WorldConfig, env_idx: int = 0, fix_cache_reference: bool = False + ): + """TODO: + + _extended_summary_ + + Args: + world_config: _description_ + env_idx: _description_ + fix_cache_reference: _description_ + """ + voxel_objs = world_config.voxel + max_obs = len(voxel_objs) + self.world_model = world_config + if max_obs < 1: + log_info("No Voxel objs") + return + if self._voxel_tensor_list is None or self._voxel_tensor_list[0].shape[1] < max_obs: + if not fix_cache_reference: + log_info("Creating Voxel cache" + str(max_obs)) + self._create_voxel_cache( + { + "layers": max_obs, + "dims": voxel_objs[0].dims, + "voxel_size": voxel_objs[0].voxel_size, + "feature_dtype": voxel_objs[0].feature_dtype, + } + ) + else: + log_error("number of OBB is larger than collision cache, create larger cache.") + + # load as a batch: + pose_batch = [c.pose for c in voxel_objs] + dims_batch = [c.dims for c in voxel_objs] + names_batch = [c.name for c in voxel_objs] + size_batch = [c.voxel_size for c in voxel_objs] + voxel_batch = self._batch_tensor_voxel(pose_batch, dims_batch, size_batch) + self._voxel_tensor_list[0][env_idx, :max_obs, :] = voxel_batch[0] + self._voxel_tensor_list[1][env_idx, :max_obs, :7] = voxel_batch[1] + + self._voxel_tensor_list[2][env_idx, :max_obs] = 1 # enabling obstacle + + self._voxel_tensor_list[2][env_idx, max_obs:] = 0 # disabling obstacle + + # copy voxel grid features: + + self._env_n_voxels[env_idx] = max_obs + self._env_voxel_names[env_idx][:max_obs] = names_batch + self.collision_types["voxel"] = True + + def _batch_tensor_voxel( + self, pose: List[List[float]], dims: List[float], voxel_size: List[float] + ): + w_T_b = Pose.from_batch_list(pose, tensor_args=self.tensor_args) + b_T_w = w_T_b.inverse() + dims_t = torch.as_tensor( + np.array(dims), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + size_t = torch.as_tensor( + np.array(voxel_size), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ).unsqueeze(-1) + params_t = torch.cat([dims_t, size_t], dim=-1) + + voxel_list = [params_t, b_T_w.get_pose_vector()] + return voxel_list + + def load_batch_collision_model(self, world_config_list: List[WorldConfig]): + """Load voxel grid for batched environments + + _extended_summary_ + + Args: + world_config_list: _description_ + + Returns: + _description_ + """ + log_error("Not Implemented") + # First find largest number of cuboid: + c_len = [] + pose_batch = [] + dims_batch = [] + names_batch = [] + vsize_batch = [] + for i in world_config_list: + c = i.cuboid + if c is not None: + c_len.append(len(c)) + pose_batch.extend([i.pose for i in c]) + dims_batch.extend([i.dims for i in c]) + names_batch.extend([i.name for i in c]) + vsize_batch.extend([i.voxel_size for i in c]) + else: + c_len.append(0) + + max_obs = max(c_len) + if max_obs < 1: + log_warn("No obbs found") + return + # check if number of environments is same as config: + reset_buffers = False + if self._env_n_voxels is not None and len(world_config_list) != len(self._env_n_voxels): 
+ log_warn( + "env_n_voxels is not same as world_config_list, reloading collision buffers (breaks CG)" + ) + reset_buffers = True + self.n_envs = len(world_config_list) + self._env_n_voxels = torch.zeros( + (self.n_envs), device=self.tensor_args.device, dtype=torch.int32 + ) + + if self._voxel_tensor_list is not None and self._voxel_tensor_list[0].shape[1] < max_obs: + log_warn( + "number of obbs is greater than buffer, reloading collision buffers (breaks CG)" + ) + reset_buffers = True + # create cache if does not exist: + if self._voxel_tensor_list is None or reset_buffers: + log_info("Creating Obb cache" + str(max_obs)) + self._create_obb_cache(max_obs) + + # load obstacles: + ## load data into gpu: + voxel_batch = self._batch_tensor_voxel(pose_batch, dims_batch, vsize_batch) + c_start = 0 + for i in range(len(self._env_n_voxels)): + if c_len[i] > 0: + # load obb: + self._voxel_tensor_list[0][i, : c_len[i], :] = voxel_batch[0][ + c_start : c_start + c_len[i] + ] + self._voxel_tensor_list[1][i, : c_len[i], :7] = voxel_batch[1][ + c_start : c_start + c_len[i] + ] + self._voxel_tensor_list[2][i, : c_len[i]] = 1 + self._env_voxel_names[i][: c_len[i]] = names_batch[c_start : c_start + c_len[i]] + self._voxel_tensor_list[2][i, c_len[i] :] = 0 + c_start += c_len[i] + self._env_n_voxels[:] = torch.as_tensor( + c_len, dtype=torch.int32, device=self.tensor_args.device + ) + self.collision_types["voxel"] = True + + return super().load_batch_collision_model(world_config_list) + + def enable_obstacle( + self, + name: str, + enable: bool = True, + env_idx: int = 0, + ): + if self._env_voxel_names is not None and name in self._env_voxel_names[env_idx]: + self.enable_voxel(enable, name, None, env_idx) + else: + return super().enable_obstacle(name, enable, env_idx) + + def enable_voxel( + self, + enable: bool = True, + name: Optional[str] = None, + env_obj_idx: Optional[torch.Tensor] = None, + env_idx: int = 0, + ): + """Update obstacle dimensions + + Args: + obj_dims (torch.Tensor): [dim.x,dim.y, dim.z], give as [b,3] + obj_idx (torch.Tensor or int): + + """ + if env_obj_idx is not None: + self._voxel_tensor_list[2][env_obj_idx] = int(enable) # enable == 1 + else: + # find index of given name: + obs_idx = self.get_voxel_idx(name, env_idx) + + self._voxel_tensor_list[2][env_idx, obs_idx] = int(enable) + + def update_obstacle_pose( + self, + name: str, + w_obj_pose: Pose, + env_idx: int = 0, + ): + if self._env_voxel_names is not None and name in self._env_voxel_names[env_idx]: + self.update_voxel_pose(name=name, w_obj_pose=w_obj_pose, env_idx=env_idx) + else: + log_error("obstacle not found in OBB world model: " + name) + + def update_voxel_data(self, new_voxel: VoxelGrid, env_idx: int = 0): + obs_idx = self.get_voxel_idx(new_voxel.name, env_idx) + self._voxel_tensor_list[3][env_idx, obs_idx, :, :] = new_voxel.feature_tensor.view( + new_voxel.feature_tensor.shape[0], -1 + ).to(dtype=self._voxel_tensor_list[3].dtype) + self._voxel_tensor_list[0][env_idx, obs_idx, :3] = self.tensor_args.to_device( + new_voxel.dims + ) + self._voxel_tensor_list[0][env_idx, obs_idx, 3] = new_voxel.voxel_size + self._voxel_tensor_list[1][env_idx, obs_idx, :7] = ( + Pose.from_list(new_voxel.pose, self.tensor_args).inverse().get_pose_vector() + ) + self._voxel_tensor_list[2][env_idx, obs_idx] = int(True) + + def update_voxel_features( + self, + features: torch.Tensor, + name: Optional[str] = None, + env_obj_idx: Optional[torch.Tensor] = None, + env_idx: int = 0, + ): + """Update pose of a specific objects. 
+ This also updates the signed distance grid to account for the updated object pose. + Args: + obj_w_pose: Pose + obj_idx: + """ + if env_obj_idx is not None: + self._voxel_tensor_list[3][env_obj_idx, :] = features.to( + dtype=self._voxel_tensor_list[3].dtype + ) + else: + obs_idx = self.get_voxel_idx(name, env_idx) + self._voxel_tensor_list[3][env_idx, obs_idx, :] = features.to( + dtype=self._voxel_tensor_list[3].dtype + ) + + def update_voxel_pose( + self, + w_obj_pose: Optional[Pose] = None, + obj_w_pose: Optional[Pose] = None, + name: Optional[str] = None, + env_obj_idx: Optional[torch.Tensor] = None, + env_idx: int = 0, + ): + """Update pose of a specific objects. + This also updates the signed distance grid to account for the updated object pose. + Args: + obj_w_pose: Pose + obj_idx: + """ + obj_w_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose) + if env_obj_idx is not None: + self._voxel_tensor_list[1][env_obj_idx, :7] = obj_w_pose.get_pose_vector() + else: + obs_idx = self.get_voxel_idx(name, env_idx) + self._voxel_tensor_list[1][env_idx, obs_idx, :7] = obj_w_pose.get_pose_vector() + + def get_voxel_idx( + self, + name: str, + env_idx: int = 0, + ) -> int: + if name not in self._env_voxel_names[env_idx]: + log_error("Obstacle with name: " + name + " not found in current world", exc_info=True) + return self._env_voxel_names[env_idx].index(name) + + def get_voxel_grid( + self, + name: str, + env_idx: int = 0, + ): + obs_idx = self.get_voxel_idx(name, env_idx) + voxel_params = np.round( + self._voxel_tensor_list[0][env_idx, obs_idx, :].cpu().numpy().astype(np.float64), 6 + ).tolist() + voxel_pose = Pose( + position=self._voxel_tensor_list[1][env_idx, obs_idx, :3], + quaternion=self._voxel_tensor_list[1][env_idx, obs_idx, 3:7], + ) + voxel_features = self._voxel_tensor_list[3][env_idx, obs_idx, :] + voxel_grid = VoxelGrid( + name=name, + dims=voxel_params[:3], + pose=voxel_pose.to_list(), + voxel_size=voxel_params[3], + feature_tensor=voxel_features, + ) + return voxel_grid + + def get_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + sum_collisions: bool = True, + compute_esdf: bool = False, + ): + if "voxel" not in self.collision_types or not self.collision_types["voxel"]: + return super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, + ) + + b, h, n, _ = query_sphere.shape # This can be read from collision query buffer + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_voxels + dist = SdfSphereVoxel.apply( + query_sphere, + collision_query_buffer.voxel_collision_buffer.distance_buffer, + collision_query_buffer.voxel_collision_buffer.grad_distance_buffer, + collision_query_buffer.voxel_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self.max_distance, + self._voxel_tensor_list[3], + self._voxel_tensor_list[0], + self._voxel_tensor_list[1], + self._voxel_tensor_list[2], + self._env_n_voxels, + env_query_idx, + self._voxel_tensor_list[0].shape[1], + b, + h, + n, + query_sphere.requires_grad, + True, + use_batch_env, + return_loss, + sum_collisions, + compute_esdf, + ) + + if ( + "primitive" not in self.collision_types + or not self.collision_types["primitive"] + or 
"mesh" not in self.collision_types + or not self.collision_types["mesh"] + ): + return dist + d_prim = super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + sum_collisions=sum_collisions, + compute_esdf=compute_esdf, + ) + if compute_esdf: + d_val = torch.maximum(dist.view(d_prim.shape), d_prim) + else: + d_val = d_val.view(d_prim.shape) + d_prim + + return d_val + + def get_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + **kwargs, + ): + if "voxel" not in self.collision_types or not self.collision_types["voxel"]: + return super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + + if return_loss: + raise ValueError("cannot return loss for classification, use get_sphere_distance") + b, h, n, _ = query_sphere.shape + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_voxels + dist = SdfSphereVoxel.apply( + query_sphere, + collision_query_buffer.voxel_collision_buffer.distance_buffer, + collision_query_buffer.voxel_collision_buffer.grad_distance_buffer, + collision_query_buffer.voxel_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self.max_distance, + self._voxel_tensor_list[3], + self._voxel_tensor_list[0], + self._voxel_tensor_list[1], + self._voxel_tensor_list[2], + self._env_n_voxels, + env_query_idx, + self._voxel_tensor_list[0].shape[1], + b, + h, + n, + query_sphere.requires_grad, + False, + use_batch_env, + False, + True, + False, + ) + if ( + "primitive" not in self.collision_types + or not self.collision_types["primitive"] + or "mesh" not in self.collision_types + or not self.collision_types["mesh"] + ): + return dist + d_prim = super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + d_val = dist.view(d_prim.shape) + d_prim + return d_val + + def get_swept_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + sum_collisions: bool = True, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + """ + if "voxel" not in self.collision_types or not self.collision_types["voxel"]: + return super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + sum_collisions=sum_collisions, + ) + b, h, n, _ = query_sphere.shape + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_voxels + + dist = SdfSweptSphereVoxel.apply( + query_sphere, + collision_query_buffer.voxel_collision_buffer.distance_buffer, + collision_query_buffer.voxel_collision_buffer.grad_distance_buffer, + 
collision_query_buffer.voxel_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self.max_distance, + speed_dt, + self._voxel_tensor_list[3], + self._voxel_tensor_list[0], + self._voxel_tensor_list[1], + self._voxel_tensor_list[2], + self._env_n_voxels, + env_query_idx, + self._voxel_tensor_list[0].shape[1], + b, + h, + n, + sweep_steps, + enable_speed_metric, + query_sphere.requires_grad, + True, + use_batch_env, + return_loss, + sum_collisions, + ) + + if ( + "primitive" not in self.collision_types + or not self.collision_types["primitive"] + or "mesh" not in self.collision_types + or not self.collision_types["mesh"] + ): + return dist + d_prim = super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + sum_collisions=sum_collisions, + ) + + d_val = dist.view(d_prim.shape) + d_prim + return d_val + + def get_swept_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + """ + if "voxel" not in self.collision_types or not self.collision_types["voxel"]: + return super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + if return_loss: + raise ValueError("cannot return loss for classify, use get_swept_sphere_distance") + b, h, n, _ = query_sphere.shape + + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_voxels + dist = SdfSweptSphereVoxel.apply( + query_sphere, + collision_query_buffer.voxel_collision_buffer.distance_buffer, + collision_query_buffer.voxel_collision_buffer.grad_distance_buffer, + collision_query_buffer.voxel_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self.max_distance, + speed_dt, + self._voxel_tensor_list[3], + self._voxel_tensor_list[0], + self._voxel_tensor_list[1], + self._voxel_tensor_list[2], + self._env_n_voxels, + env_query_idx, + self._voxel_tensor_list[0].shape[1], + b, + h, + n, + sweep_steps, + enable_speed_metric, + query_sphere.requires_grad, + False, + use_batch_env, + return_loss, + True, + ) + if ( + "primitive" not in self.collision_types + or not self.collision_types["primitive"] + or "mesh" not in self.collision_types + or not self.collision_types["mesh"] + ): + return dist + d_prim = super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + d_val = dist.view(d_prim.shape) + d_prim + return d_val + + def clear_cache(self): + if self._voxel_tensor_list is not None: + self._voxel_tensor_list[2][:] = 0 + self._voxel_tensor_list[-1][:] = -1.0 * self.max_distance + self._env_n_voxels[:] = 0 diff --git a/src/curobo/geom/transform.py 
b/src/curobo/geom/transform.py index fc1654b..fd95671 100644 --- a/src/curobo/geom/transform.py +++ b/src/curobo/geom/transform.py @@ -18,6 +18,7 @@ import warp as wp # CuRobo from curobo.curobolib.kinematics import rotation_matrix_to_quaternion from curobo.util.logger import log_error +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.warp import init_warp @@ -27,11 +28,11 @@ def transform_points( if out_points is None: out_points = torch.zeros((points.shape[0], 3), device=points.device, dtype=points.dtype) if out_gp is None: - out_gp = torch.zeros((position.shape[0], 3), device=position.device) + out_gp = torch.zeros((position.shape[0], 3), device=position.device, dtype=points.dtype) if out_gq is None: - out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device) + out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device, dtype=points.dtype) if out_gpt is None: - out_gpt = torch.zeros((points.shape[0], 3), device=position.device) + out_gpt = torch.zeros((points.shape[0], 3), device=position.device, dtype=points.dtype) out_points = TransformPoint.apply( position, quaternion, points, out_points, out_gp, out_gq, out_gpt ) @@ -46,18 +47,20 @@ def batch_transform_points( (points.shape[0], points.shape[1], 3), device=points.device, dtype=points.dtype ) if out_gp is None: - out_gp = torch.zeros((position.shape[0], 3), device=position.device) + out_gp = torch.zeros((position.shape[0], 3), device=position.device, dtype=points.dtype) if out_gq is None: - out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device) + out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device, dtype=points.dtype) if out_gpt is None: - out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=position.device) + out_gpt = torch.zeros( + (points.shape[0], points.shape[1], 3), device=position.device, dtype=points.dtype + ) out_points = BatchTransformPoint.apply( position, quaternion, points, out_points, out_gp, out_gq, out_gpt ) return out_points -@torch.jit.script +@get_torch_jit_decorator() def get_inv_transform(w_rot_c, w_trans_c): # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor] c_rot_w = w_rot_c.transpose(-1, -2) @@ -65,7 +68,7 @@ def get_inv_transform(w_rot_c, w_trans_c): return c_rot_w, c_trans_w -@torch.jit.script +@get_torch_jit_decorator() def transform_point_inverse(point, rot, trans): # type: (Tensor, Tensor, Tensor) -> Tensor diff --git a/src/curobo/geom/types.py b/src/curobo/geom/types.py index 08bd604..3d01099 100644 --- a/src/curobo/geom/types.py +++ b/src/curobo/geom/types.py @@ -11,6 +11,7 @@ from __future__ import annotations # Standard Library +import math from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Sequence, Union @@ -564,6 +565,68 @@ class PointCloud(Obstacle): return new_spheres +@dataclass +class VoxelGrid(Obstacle): + dims: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) + voxel_size: float = 0.02 # meters + feature_tensor: Optional[torch.Tensor] = None + xyzr_tensor: Optional[torch.Tensor] = None + feature_dtype: torch.dtype = torch.float32 + + def __post_init__(self): + if self.feature_tensor is not None: + self.feature_dtype = self.feature_tensor.dtype + + def create_xyzr_tensor( + self, transform_to_origin: bool = False, tensor_args: TensorDeviceType = TensorDeviceType() + ): + bounds = self.dims + low = [-bounds[0] / 2, -bounds[1] / 2, -bounds[2] / 2] + high = [bounds[0] / 2, bounds[1] / 2, bounds[2] / 2] + trange = [h - l for l, h in zip(low, 
high)] + x = torch.linspace( + low[0], high[0], int(math.floor(trange[0] / self.voxel_size)), device=tensor_args.device + ) + y = torch.linspace( + low[1], high[1], int(math.floor(trange[1] / self.voxel_size)), device=tensor_args.device + ) + z = torch.linspace( + low[2], high[2], int(math.floor(trange[2] / self.voxel_size)), device=tensor_args.device + ) + w, l, h = x.shape[0], y.shape[0], z.shape[0] + xyz = ( + torch.stack(torch.meshgrid(x, y, z, indexing="ij")).permute((1, 2, 3, 0)).reshape(-1, 3) + ) + + if transform_to_origin: + pose = Pose.from_list(self.pose, tensor_args=tensor_args) + xyz = pose.transform_points(xyz.contiguous()) + r = torch.zeros_like(xyz[:, 0:1]) + (self.voxel_size * 0.5) + xyzr = torch.cat([xyz, r], dim=1) + return xyzr + + def get_occupied_voxels(self, feature_threshold: Optional[float] = None): + if feature_threshold is None: + feature_threshold = -1.0 * self.voxel_size + if self.xyzr_tensor is None or self.feature_tensor is None: + log_error("Feature tensor or xyzr tensor is empty") + xyzr = self.xyzr_tensor.clone() + xyzr[:, 3] = self.feature_tensor + occupied = xyzr[self.feature_tensor > feature_threshold] + return occupied + + def clone(self): + return VoxelGrid( + name=self.name, + pose=self.pose.copy(), + dims=self.dims.copy(), + feature_tensor=self.feature_tensor.clone() if self.feature_tensor is not None else None, + xyzr_tensor=self.xyzr_tensor.clone() if self.xyzr_tensor is not None else None, + feature_dtype=self.feature_dtype, + voxel_size=self.voxel_size, + ) + + @dataclass class WorldConfig(Sequence): """Representation of World for use in CuRobo.""" @@ -586,25 +649,13 @@ class WorldConfig(Sequence): #: BloxMap obstacle. blox: Optional[List[BloxMap]] = None + voxel: Optional[List[VoxelGrid]] = None + #: List of all obstacles in world. 
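# Illustration only (not part of the patch): a minimal usage sketch of the
# VoxelGrid obstacle introduced above. The wxyz quaternion ordering in pose and
# the sign convention of the stored feature (signed distance, positive inside
# obstacles) are assumptions based on this diff.
import torch
from curobo.geom.types import VoxelGrid, WorldConfig

grid = VoxelGrid(
    name="esdf_layer",
    pose=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],  # x, y, z, qw, qx, qy, qz
    dims=[0.2, 0.2, 0.2],  # meters
    voxel_size=0.02,       # meters
)
# Voxel centers with a radius column, one row per voxel, in the grid frame.
grid.xyzr_tensor = grid.create_xyzr_tensor()
# Start with deep free space, then mark a single voxel as occupied.
grid.feature_tensor = -10.0 * torch.ones(
    grid.xyzr_tensor.shape[0], device=grid.xyzr_tensor.device
)
grid.feature_tensor[0] = 0.05
occupied = grid.get_occupied_voxels()  # rows whose feature exceeds the threshold
world = WorldConfig(voxel=[grid])      # voxel grids now travel with WorldConfig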
objects: Optional[List[Obstacle]] = None def __post_init__(self): # create objects list: - if self.objects is None: - self.objects = [] - if self.sphere is not None: - self.objects += self.sphere - if self.cuboid is not None: - self.objects += self.cuboid - if self.capsule is not None: - self.objects += self.capsule - if self.mesh is not None: - self.objects += self.mesh - if self.blox is not None: - self.objects += self.blox - if self.cylinder is not None: - self.objects += self.cylinder if self.sphere is None: self.sphere = [] if self.cuboid is None: @@ -617,6 +668,18 @@ class WorldConfig(Sequence): self.cylinder = [] if self.blox is None: self.blox = [] + if self.voxel is None: + self.voxel = [] + if self.objects is None: + self.objects = ( + self.sphere + + self.cuboid + + self.capsule + + self.mesh + + self.cylinder + + self.blox + + self.voxel + ) def __len__(self): return len(self.objects) @@ -632,6 +695,7 @@ class WorldConfig(Sequence): capsule=self.capsule.copy() if self.capsule is not None else None, cylinder=self.cylinder.copy() if self.cylinder is not None else None, blox=self.blox.copy() if self.blox is not None else None, + voxel=self.voxel.copy() if self.voxel is not None else None, ) @staticmethod @@ -642,6 +706,7 @@ class WorldConfig(Sequence): mesh = None blox = None cylinder = None + voxel = None # load yaml: if "cuboid" in data_dict.keys(): cuboid = [Cuboid(name=x, **data_dict["cuboid"][x]) for x in data_dict["cuboid"]] @@ -655,6 +720,8 @@ class WorldConfig(Sequence): cylinder = [Cylinder(name=x, **data_dict["cylinder"][x]) for x in data_dict["cylinder"]] if "blox" in data_dict.keys(): blox = [BloxMap(name=x, **data_dict["blox"][x]) for x in data_dict["blox"]] + if "voxel" in data_dict.keys(): + voxel = [VoxelGrid(name=x, **data_dict["voxel"][x]) for x in data_dict["voxel"]] return WorldConfig( cuboid=cuboid, @@ -663,6 +730,7 @@ class WorldConfig(Sequence): cylinder=cylinder, mesh=mesh, blox=blox, + voxel=voxel, ) # load world config as obbs: convert all types to obbs @@ -688,6 +756,10 @@ class WorldConfig(Sequence): if current_world.mesh is not None and len(current_world.mesh) > 0: mesh_obb = [x.get_cuboid() for x in current_world.mesh] + + if current_world.voxel is not None and len(current_world.voxel) > 0: + log_error("VoxelGrid cannot be converted to obb world") + return WorldConfig( cuboid=cuboid_obb + sphere_obb + capsule_obb + cylinder_obb + mesh_obb + blox_obb ) @@ -714,6 +786,8 @@ class WorldConfig(Sequence): for i in range(len(current_world.blox)): if current_world.blox[i].mesh is not None: blox_obb.append(current_world.blox[i].get_mesh(process=process)) + if current_world.voxel is not None and len(current_world.voxel) > 0: + log_error("VoxelGrid cannot be converted to mesh world") return WorldConfig( mesh=current_world.mesh @@ -750,6 +824,7 @@ class WorldConfig(Sequence): return WorldConfig( mesh=current_world.mesh + sphere_obb + capsule_obb + cylinder_obb + blox_obb, cuboid=cuboid_obb, + voxel=current_world.voxel, ) @staticmethod @@ -822,6 +897,8 @@ class WorldConfig(Sequence): self.cylinder.append(obstacle) elif isinstance(obstacle, Capsule): self.capsule.append(obstacle) + elif isinstance(obstacle, VoxelGrid): + self.voxel.append(obstacle) else: ValueError("Obstacle type not supported") self.objects.append(obstacle) diff --git a/src/curobo/graph/graph_base.py b/src/curobo/graph/graph_base.py index 73ae013..c58acae 100644 --- a/src/curobo/graph/graph_base.py +++ b/src/curobo/graph/graph_base.py @@ -33,6 +33,7 @@ from curobo.types.base import 
TensorDeviceType from curobo.types.robot import JointState, RobotConfig, State from curobo.util.logger import log_info, log_warn from curobo.util.sample_lib import HaltonGenerator +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.trajectory import InterpolateType, get_interpolated_trajectory from curobo.util_file import ( get_robot_configs_path, @@ -1029,7 +1030,7 @@ class GraphPlanBase(GraphConfig): pass -@torch.jit.script +@get_torch_jit_decorator(dynamic=True) def get_unique_nodes(dist_node: torch.Tensor, nodes: torch.Tensor, node_distance: float): node_flag = dist_node <= node_distance dist_node[node_flag] = 0.0 @@ -1042,7 +1043,7 @@ def get_unique_nodes(dist_node: torch.Tensor, nodes: torch.Tensor, node_distance return unique_nodes, n_inv -@torch.jit.script +@get_torch_jit_decorator(force_jit=True, dynamic=True) def add_new_nodes_jit( nodes, new_nodes, flag, cat_buffer, path, idx, i: int, dof: int ) -> Tuple[torch.Tensor, torch.Tensor, int]: @@ -1066,7 +1067,7 @@ def add_new_nodes_jit( return path, node_set, new_nodes.shape[0] -@torch.jit.script +@get_torch_jit_decorator(force_jit=True, dynamic=True) def add_all_nodes_jit( nodes, cat_buffer, path, i: int, dof: int ) -> Tuple[torch.Tensor, torch.Tensor, int]: @@ -1084,20 +1085,20 @@ def add_all_nodes_jit( return path, node_set, nodes.shape[0] -@torch.jit.script +@get_torch_jit_decorator(force_jit=True, dynamic=True) def compute_distance_norm_jit(pt, batch_pts, distance_weight): vec = (batch_pts - pt) * distance_weight dist = torch.norm(vec, dim=-1) return dist -@torch.jit.script +@get_torch_jit_decorator(dynamic=True) def compute_distance_jit(pt, batch_pts, distance_weight): vec = (batch_pts - pt) * distance_weight return vec -@torch.jit.script +@get_torch_jit_decorator(dynamic=True) def compute_rotation_frame_jit( x_start: torch.Tensor, x_goal: torch.Tensor, rot_frame_col: torch.Tensor ) -> torch.Tensor: @@ -1114,7 +1115,7 @@ def compute_rotation_frame_jit( return C -@torch.jit.script +@get_torch_jit_decorator(force_jit=True, dynamic=True) def biased_vertex_projection_jit( x_start, x_goal, @@ -1144,7 +1145,7 @@ def biased_vertex_projection_jit( return x_samples -@torch.jit.script +@get_torch_jit_decorator(force_jit=True, dynamic=True) def cat_xc_jit(x, n: int): c = x[:, 0:1] * 0.0 xc_search = torch.cat((x, c), dim=1)[:n, :] diff --git a/src/curobo/opt/newton/lbfgs.py b/src/curobo/opt/newton/lbfgs.py index 2017ee9..e5e59d7 100644 --- a/src/curobo/opt/newton/lbfgs.py +++ b/src/curobo/opt/newton/lbfgs.py @@ -20,12 +20,13 @@ import torch.autograd.profiler as profiler # CuRobo from curobo.curobolib.opt import LBFGScu from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig -from curobo.util.logger import log_warn +from curobo.util.logger import log_info +from curobo.util.torch_utils import get_torch_jit_decorator # kernel for l-bfgs: -# @torch.jit.script -def compute_step_direction( +@get_torch_jit_decorator() +def jit_lbfgs_compute_step_direction( alpha_buffer, rho_buffer, y_buffer, @@ -35,6 +36,8 @@ def compute_step_direction( epsilon: float, stable_mode: bool = True, ): + grad_q = grad_q.transpose(-1, -2) + # m = 15 (int) # y_buffer, s_buffer: m x b x 175 # q, grad_q: b x 175 @@ -60,12 +63,39 @@ def compute_step_direction( return -1.0 * r +@get_torch_jit_decorator() +def jit_lbfgs_update_buffers( + q, grad_q, s_buffer, y_buffer, rho_buffer, x_0, grad_0, stable_mode: bool +): + grad_q = grad_q.transpose(-1, -2) + q = q.unsqueeze(-1) + + y = grad_q - grad_0 + s = q - x_0 + rho = 1 / 
(y.transpose(-1, -2) @ s) + if stable_mode: + rho = torch.nan_to_num(rho, 0.0, 0.0, 0.0) + s_buffer[0] = s + s_buffer[:] = torch.roll(s_buffer, -1, dims=0) + y_buffer[0] = y + y_buffer[:] = torch.roll(y_buffer, -1, dims=0) # .copy_(y_buff) + + rho_buffer[0] = rho + + rho_buffer[:] = torch.roll(rho_buffer, -1, dims=0) + + x_0.copy_(q) + grad_0.copy_(grad_q) + return s_buffer, y_buffer, rho_buffer, x_0, grad_0 + + @dataclass class LBFGSOptConfig(NewtonOptConfig): history: int = 10 epsilon: float = 0.01 use_cuda_kernel: bool = True stable_mode: bool = True + use_shared_buffers_kernel: bool = True def __post_init__(self): return super().__post_init__() @@ -77,11 +107,15 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig): if config is not None: LBFGSOptConfig.__init__(self, **vars(config)) NewtonOptBase.__init__(self) - if self.d_opt >= 1024 or self.history > 15: - log_warn("LBFGS: Not using LBFGS Cuda Kernel as d_opt>1024 or history>15") + if ( + self.d_opt >= 1024 + or self.history > 31 + or ((self.d_opt * self.history + 33) * 4 >= 48000) + ): + log_info("LBFGS: Not using LBFGS Cuda Kernel as d_opt>1024 or history>15") self.use_cuda_kernel = False - if self.history >= self.d_opt: - log_warn("LBFGS: history >= d_opt, reducing history to d_opt-1") + if self.history > self.d_opt: + log_info("LBFGS: history >= d_opt, reducing history to d_opt-1") self.history = self.d_opt - 1 @profiler.record_function("lbfgs/reset") @@ -130,7 +164,7 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig): def _get_step_direction(self, cost, q, grad_q): if self.use_cuda_kernel: with profiler.record_function("lbfgs/fused"): - dq = LBFGScu._call_cuda( + dq = LBFGScu.apply( self.step_q_buffer, self.rho_buffer, self.y_buffer, @@ -141,13 +175,14 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig): self.grad_0, self.epsilon, self.stable_mode, + self.use_shared_buffers_kernel, ) else: - grad_q = grad_q.transpose(-1, -2) - q = q.unsqueeze(-1) + self._update_buffers(q, grad_q) - dq = compute_step_direction( + + dq = jit_lbfgs_compute_step_direction( self.alpha_buffer, self.rho_buffer, self.y_buffer, @@ -177,6 +212,23 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig): return -1.0 * r def _update_buffers(self, q, grad_q): + if True: + self.s_buffer, self.y_buffer, self.rho_buffer, self.x_0, self.grad_0 = ( + jit_lbfgs_update_buffers( + q, + grad_q, + self.s_buffer, + self.y_buffer, + self.rho_buffer, + self.x_0, + self.grad_0, + self.stable_mode, + ) + ) + return + grad_q = grad_q.transpose(-1, -2) + q = q.unsqueeze(-1) + y = grad_q - self.grad_0 s = q - self.x_0 rho = 1 / (y.transpose(-1, -2) @ s) diff --git a/src/curobo/opt/newton/newton_base.py b/src/curobo/opt/newton/newton_base.py index 62eaf21..ea9061d 100644 --- a/src/curobo/opt/newton/newton_base.py +++ b/src/curobo/opt/newton/newton_base.py @@ -26,6 +26,7 @@ from curobo.opt.opt_base import Optimizer, OptimizerConfig from curobo.rollout.dynamics_model.integration_utils import build_fd_matrix from curobo.types.base import TensorDeviceType from curobo.types.tensor import T_BDOF, T_BHDOF_float, T_BHValue_float, T_BValue_float, T_HDOF_float +from curobo.util.torch_utils import get_torch_jit_decorator class LineSearchType(Enum): @@ -108,6 +109,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig): self.action_horizon, device=self.tensor_args.device, dtype=self.tensor_args.dtype ).unsqueeze(0) self._temporal_mat += eye_mat + self.rollout_fn.sum_horizon = True def reset_cuda_graph(self): if self.cu_opt_graph is not None: @@ -222,11 +224,14 @@ class NewtonOptBase(Optimizer, 
NewtonOptConfig): self.n_problems * self.num_particles, self.action_horizon, self.rollout_fn.d_action ) trajectories = self.rollout_fn(x_in) # x_n = (batch*line_search_scale) x horizon x d_action - cost = torch.sum( - trajectories.costs.view(self.n_problems, self.num_particles, self.horizon), - dim=-1, - keepdim=True, - ) + if len(trajectories.costs.shape) == 2: + cost = torch.sum( + trajectories.costs.view(self.n_problems, self.num_particles, self.horizon), + dim=-1, + keepdim=True, + ) + else: + cost = trajectories.costs.view(self.n_problems, self.num_particles, 1) g_x = cost.backward(gradient=self.l_vec, retain_graph=False) g_x = x_n.grad.detach() return ( @@ -542,7 +547,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig): ) -@torch.jit.script +@get_torch_jit_decorator() def get_x_set_jit(step_vec, x, alpha_list, action_lows, action_highs): # step_direction = step_direction.detach() x_set = torch.clamp(x.unsqueeze(-2) + alpha_list * step_vec, action_lows, action_highs) @@ -550,7 +555,7 @@ def get_x_set_jit(step_vec, x, alpha_list, action_lows, action_highs): return x_set -@torch.jit.script +@get_torch_jit_decorator() def _armijo_line_search_tail_jit(c, g_x, step_direction, c_1, alpha_list, c_idx, x_set, d_opt): c_0 = c[:, 0:1] g_0 = g_x[:, 0:1] @@ -581,7 +586,7 @@ def _armijo_line_search_tail_jit(c, g_x, step_direction, c_1, alpha_list, c_idx, return (best_x, best_c, best_grad) -@torch.jit.script +@get_torch_jit_decorator() def _wolfe_search_tail_jit(c, g_x, x_set, m, d_opt: int): b, h, _ = x_set.shape g_x = g_x.view(b * h, -1) @@ -593,7 +598,7 @@ def _wolfe_search_tail_jit(c, g_x, x_set, m, d_opt: int): return (best_x, best_c, best_grad) -@torch.jit.script +@get_torch_jit_decorator() def scale_action(dx, action_step_max): scale_value = torch.max(torch.abs(dx) / action_step_max, dim=-1, keepdim=True)[0] scale_value = torch.clamp(scale_value, 1.0) @@ -601,7 +606,7 @@ def scale_action(dx, action_step_max): return dx_scaled -@torch.jit.script +@get_torch_jit_decorator() def check_convergence( best_iteration: torch.Tensor, current_iteration: torch.Tensor, last_best: int ) -> bool: diff --git a/src/curobo/opt/particle/parallel_es.py b/src/curobo/opt/particle/parallel_es.py index e3d955b..f75f648 100644 --- a/src/curobo/opt/particle/parallel_es.py +++ b/src/curobo/opt/particle/parallel_es.py @@ -16,7 +16,15 @@ from typing import Optional import torch # CuRobo -from curobo.opt.particle.parallel_mppi import CovType, ParallelMPPI, ParallelMPPIConfig +from curobo.opt.particle.parallel_mppi import ( + CovType, + ParallelMPPI, + ParallelMPPIConfig, + Trajectory, + jit_blend_mean, +) +from curobo.opt.particle.particle_opt_base import SampleMode +from curobo.util.torch_utils import get_torch_jit_decorator @dataclass @@ -38,14 +46,72 @@ class ParallelES(ParallelMPPI, ParallelESConfig): ) # get the new means from here # use Evolutionary Strategy Mean here: + new_mean = jit_blend_mean(self.mean_action, new_mean, self.step_size_mean) return new_mean + def _exp_util_from_costs(self, costs): + total_costs = self._compute_total_cost(costs) + w = self._exp_util(total_costs) + return w + def _exp_util(self, total_costs): w = calc_exp(total_costs) return w + def _compute_mean_covariance(self, costs, actions): + w = self._exp_util_from_costs(costs) + w = w.unsqueeze(-1).unsqueeze(-1) + new_mean = self._compute_mean(w, actions) + new_cov = self._compute_covariance(w, actions) + self._update_cov_scale(new_cov) -@torch.jit.script + return new_mean, new_cov + + @torch.no_grad() + def _update_distribution(self, 
trajectories: Trajectory): + costs = trajectories.costs + actions = trajectories.actions + + # Let's reshape to n_problems now: + + # first find the means before doing exponential utility: + + # Update best action + if self.sample_mode == SampleMode.BEST: + w = self._exp_util_from_costs(costs) + + best_idx = torch.argmax(w, dim=-1) + self.best_traj.copy_(actions[self.problem_col, best_idx]) + + if self.store_rollouts and self.visual_traj is not None: + total_costs = self._compute_total_cost(costs) + vis_seq = getattr(trajectories.state, self.visual_traj) + top_values, top_idx = torch.topk(total_costs, 20, dim=1) + self.top_values = top_values + self.top_idx = top_idx + top_trajs = torch.index_select(vis_seq, 0, top_idx[0]) + for i in range(1, top_idx.shape[0]): + trajs = torch.index_select( + vis_seq, 0, top_idx[i] + (self.particles_per_problem * i) + ) + top_trajs = torch.cat((top_trajs, trajs), dim=0) + if self.top_trajs is None or top_trajs.shape != self.top_trajs: + self.top_trajs = top_trajs + else: + self.top_trajs.copy_(top_trajs) + + if not self.update_cov: + w = w.unsqueeze(-1).unsqueeze(-1) + + new_mean = self._compute_mean(w, actions) + else: + new_mean, new_cov = self._compute_mean_covariance(costs, actions) + self.cov_action.copy_(new_cov) + + self.mean_action.copy_(new_mean) + + +@get_torch_jit_decorator() def calc_exp( total_costs, ): @@ -58,7 +124,7 @@ def calc_exp( return w -@torch.jit.script +@get_torch_jit_decorator() def compute_es_mean( w, actions, mean_action, full_inv_cov, num_particles: int, learning_rate: float ): diff --git a/src/curobo/opt/particle/parallel_mppi.py b/src/curobo/opt/particle/parallel_mppi.py index 7d728da..c97ad98 100644 --- a/src/curobo/opt/particle/parallel_mppi.py +++ b/src/curobo/opt/particle/parallel_mppi.py @@ -33,6 +33,7 @@ from curobo.types.robot import State from curobo.util.logger import log_info from curobo.util.sample_lib import HaltonSampleLib, SampleConfig, SampleLib from curobo.util.tensor_util import copy_tensor +from curobo.util.torch_utils import get_torch_jit_decorator class BaseActionType(Enum): @@ -187,11 +188,43 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): # w = torch.softmax((-1.0 / self.beta) * total_costs, dim=-1) return w + def _exp_util_from_costs(self, costs): + w = jit_calculate_exp_util_from_costs(costs, self.gamma_seq, self.beta) + return w + def _compute_mean(self, w, actions): # get the new means from here new_mean = torch.sum(w * actions, dim=-3) + new_mean = jit_blend_mean(self.mean_action, new_mean, self.step_size_mean) return new_mean + def _compute_mean_covariance(self, costs, actions): + if self.cov_type == CovType.FULL_A: + log_error("Not implemented") + if self.cov_type == CovType.DIAG_A: + new_mean, new_cov, new_scale_tril = jit_mean_cov_diag_a( + costs, + actions, + self.gamma_seq, + self.mean_action, + self.cov_action, + self.step_size_mean, + self.step_size_cov, + self.kappa, + self.beta, + ) + self.scale_tril.copy_(new_scale_tril) + # self._update_cov_scale(new_cov) + + else: + w = self._exp_util_from_costs(costs) + w = w.unsqueeze(-1).unsqueeze(-1) + new_mean = self._compute_mean(w, actions) + new_cov = self._compute_covariance(w, actions) + self._update_cov_scale(new_cov) + + return new_mean, new_cov + def _compute_covariance(self, w, actions): if not self.update_cov: return @@ -200,27 +233,13 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): if self.cov_type == CovType.SIGMA_I: delta_actions = actions - self.mean_action.unsqueeze(-3) - # weighted_delta = w * (delta ** 2).T - 
# cov_update = torch.ean(torch.sum(weighted_delta.T, dim=0)) - # print(cov_update.shape, self.cov_action) weighted_delta = w * (delta_actions**2) cov_update = torch.mean( torch.sum(torch.sum(weighted_delta, dim=-2), dim=-1), dim=-1, keepdim=True ) - # raise NotImplementedError("Need to implement covariance update of form sigma*I") elif self.cov_type == CovType.DIAG_A: - # Diagonal covariance of size AxA - # n, b, h, d = delta_actions.shape - # delta_actions = delta_actions.view(n,b,h*d) - # weighted_delta = w * (delta_actions**2) - # weighted_delta = - # sum across horizon and mean across particles: - # cov_update = torch.diag(torch.mean(torch.sum(weighted_delta.T , dim=0), dim=0)) - # cov_update = torch.mean(torch.sum(weighted_delta, dim=-2), dim=-2).unsqueeze( - # -2 - # ) # .expand(-1,-1,-1) cov_update = jit_diag_a_cov_update(w, actions, self.mean_action) elif self.cov_type == CovType.FULL_A: @@ -241,19 +260,22 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): else: raise ValueError("Unidentified covariance type in update_distribution") + cov_update = jit_blend_cov(self.cov_action, cov_update, self.step_size_cov, self.kappa) return cov_update - def _update_cov_scale(self): + def _update_cov_scale(self, new_cov=None): + if new_cov is None: + new_cov = self.cov_action if not self.update_cov: return if self.cov_type == CovType.SIGMA_I: - self.scale_tril = torch.sqrt(self.cov_action) + self.scale_tril = torch.sqrt(new_cov) elif self.cov_type == CovType.DIAG_A: - self.scale_tril.copy_(torch.sqrt(self.cov_action)) + self.scale_tril.copy_(torch.sqrt(new_cov)) elif self.cov_type == CovType.FULL_A: - self.scale_tril = matrix_cholesky(self.cov_action) + self.scale_tril = matrix_cholesky(new_cov) elif self.cov_type == CovType.FULL_HA: raise NotImplementedError @@ -263,44 +285,44 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): costs = trajectories.costs actions = trajectories.actions - total_costs = self._compute_total_cost(costs) # Let's reshape to n_problems now: # first find the means before doing exponential utility: - w = self._exp_util(total_costs) - # Update best action - if self.sample_mode == SampleMode.BEST: - best_idx = torch.argmax(w, dim=-1) - self.best_traj.copy_(actions[self.problem_col, best_idx]) + with profiler.record_function("mppi/get_best"): - if self.store_rollouts and self.visual_traj is not None: - vis_seq = getattr(trajectories.state, self.visual_traj) - top_values, top_idx = torch.topk(total_costs, 20, dim=1) - self.top_values = top_values - self.top_idx = top_idx - top_trajs = torch.index_select(vis_seq, 0, top_idx[0]) - for i in range(1, top_idx.shape[0]): - trajs = torch.index_select( - vis_seq, 0, top_idx[i] + (self.particles_per_problem * i) - ) - top_trajs = torch.cat((top_trajs, trajs), dim=0) - if self.top_trajs is None or top_trajs.shape != self.top_trajs: - self.top_trajs = top_trajs - else: - self.top_trajs.copy_(top_trajs) + # Update best action + if self.sample_mode == SampleMode.BEST: + w = self._exp_util_from_costs(costs) + best_idx = torch.argmax(w, dim=-1) + self.best_traj.copy_(actions[self.problem_col, best_idx]) + with profiler.record_function("mppi/store_rollouts"): - w = w.unsqueeze(-1).unsqueeze(-1) + if self.store_rollouts and self.visual_traj is not None: + total_costs = self._compute_total_cost(costs) + vis_seq = getattr(trajectories.state, self.visual_traj) + top_values, top_idx = torch.topk(total_costs, 20, dim=1) + self.top_values = top_values + self.top_idx = top_idx + top_trajs = torch.index_select(vis_seq, 0, 
top_idx[0]) + for i in range(1, top_idx.shape[0]): + trajs = torch.index_select( + vis_seq, 0, top_idx[i] + (self.particles_per_problem * i) + ) + top_trajs = torch.cat((top_trajs, trajs), dim=0) + if self.top_trajs is None or top_trajs.shape != self.top_trajs: + self.top_trajs = top_trajs + else: + self.top_trajs.copy_(top_trajs) - new_mean = self._compute_mean(w, actions) - # print(new_mean) - if self.update_cov: - cov_update = self._compute_covariance(w, actions) - new_cov = jit_blend_cov(self.cov_action, cov_update, self.step_size_cov, self.kappa) + if not self.update_cov: + w = self._exp_util_from_costs(costs) + w = w.unsqueeze(-1).unsqueeze(-1) + new_mean = self._compute_mean(w, actions) + else: + new_mean, new_cov = self._compute_mean_covariance(costs, actions) self.cov_action.copy_(new_cov) - self._update_cov_scale() - new_mean = jit_blend_mean(self.mean_action, new_mean, self.step_size_mean) self.mean_action.copy_(new_mean) @torch.no_grad() @@ -591,20 +613,28 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): return super().generate_rollouts(init_act) -@torch.jit.script +@get_torch_jit_decorator() def jit_calculate_exp_util(beta: float, total_costs): w = torch.softmax((-1.0 / beta) * total_costs, dim=-1) return w -@torch.jit.script +@get_torch_jit_decorator() +def jit_calculate_exp_util_from_costs(costs, gamma_seq, beta: float): + cost_seq = gamma_seq * costs + cost_seq = torch.sum(cost_seq, dim=-1, keepdim=False) / gamma_seq[..., 0] + w = torch.softmax((-1.0 / beta) * cost_seq, dim=-1) + return w + + +@get_torch_jit_decorator() def jit_compute_total_cost(gamma_seq, costs): cost_seq = gamma_seq * costs cost_seq = torch.sum(cost_seq, dim=-1, keepdim=False) / gamma_seq[..., 0] return cost_seq -@torch.jit.script +@get_torch_jit_decorator() def jit_diag_a_cov_update(w, actions, mean_action): delta_actions = actions - mean_action.unsqueeze(-3) @@ -616,13 +646,35 @@ def jit_diag_a_cov_update(w, actions, mean_action): return cov_update -@torch.jit.script +@get_torch_jit_decorator() def jit_blend_cov(cov_action, cov_update, step_size_cov: float, kappa: float): new_cov = (1.0 - step_size_cov) * cov_action + step_size_cov * cov_update + kappa return new_cov -@torch.jit.script +@get_torch_jit_decorator() def jit_blend_mean(mean_action, new_mean, step_size_mean: float): mean_update = (1.0 - step_size_mean) * mean_action + step_size_mean * new_mean return mean_update + + +@get_torch_jit_decorator() +def jit_mean_cov_diag_a( + costs, + actions, + gamma_seq, + mean_action, + cov_action, + step_size_mean: float, + step_size_cov: float, + kappa: float, + beta: float, +): + w = jit_calculate_exp_util_from_costs(costs, gamma_seq, beta) + w = w.unsqueeze(-1).unsqueeze(-1) + new_mean = torch.sum(w * actions, dim=-3) + new_mean = jit_blend_mean(mean_action, new_mean, step_size_mean) + cov_update = jit_diag_a_cov_update(w, actions, mean_action) + new_cov = jit_blend_cov(cov_action, cov_update, step_size_cov, kappa) + new_tril = torch.sqrt(new_cov) + return new_mean, new_cov, new_tril diff --git a/src/curobo/opt/particle/particle_opt_base.py b/src/curobo/opt/particle/particle_opt_base.py index ffb849f..d3da069 100644 --- a/src/curobo/opt/particle/particle_opt_base.py +++ b/src/curobo/opt/particle/particle_opt_base.py @@ -263,7 +263,7 @@ class ParticleOptBase(Optimizer, ParticleOptConfig): trajectory.costs = trajectory.costs.view( self.n_problems, self.particles_per_problem, self.horizon ) - with profiler.record_function("mpc/mppi/update_distribution"): + with 
profiler.record_function("mppi/update_distribution"): self._update_distribution(trajectory) if not self.use_cuda_graph and self.store_debug: self.debug.append(self._get_action_seq(mode=self.sample_mode).clone()) diff --git a/src/curobo/opt/particle/particle_opt_utils.py b/src/curobo/opt/particle/particle_opt_utils.py index d915e81..2c0bf07 100644 --- a/src/curobo/opt/particle/particle_opt_utils.py +++ b/src/curobo/opt/particle/particle_opt_utils.py @@ -18,6 +18,7 @@ import torch.autograd.profiler as profiler # CuRobo from curobo.types.base import TensorDeviceType +from curobo.util.torch_utils import get_torch_jit_decorator class SquashType(Enum): @@ -74,7 +75,7 @@ def get_stomp_cov( return cov, scale_tril -@torch.jit.script +@get_torch_jit_decorator() def get_stomp_cov_jit( horizon: int, d_action: int, @@ -245,7 +246,7 @@ def gaussian_kl(mean0, cov0, mean1, cov1, cov_type="full"): return term1 + mahalanobis_dist + term3 -# @torch.jit.script +# @get_torch_jit_decorator() def cost_to_go(cost_seq, gamma_seq, only_first=False): # type: (Tensor, Tensor, bool) -> Tensor diff --git a/src/curobo/rollout/arm_base.py b/src/curobo/rollout/arm_base.py index 603754b..f98279f 100644 --- a/src/curobo/rollout/arm_base.py +++ b/src/curobo/rollout/arm_base.py @@ -40,7 +40,7 @@ from curobo.types.base import TensorDeviceType from curobo.types.robot import CSpaceConfig, RobotConfig from curobo.types.state import JointState from curobo.util.logger import log_error, log_info, log_warn -from curobo.util.tensor_util import cat_sum +from curobo.util.tensor_util import cat_sum, cat_sum_horizon @dataclass @@ -104,10 +104,10 @@ class ArmCostConfig: @dataclass class ArmBaseConfig(RolloutConfig): - model_cfg: KinematicModelConfig - cost_cfg: ArmCostConfig - constraint_cfg: ArmCostConfig - convergence_cfg: ArmCostConfig + model_cfg: Optional[KinematicModelConfig] = None + cost_cfg: Optional[ArmCostConfig] = None + constraint_cfg: Optional[ArmCostConfig] = None + convergence_cfg: Optional[ArmCostConfig] = None world_coll_checker: Optional[WorldCollision] = None @staticmethod @@ -322,7 +322,9 @@ class ArmBase(RolloutBase, ArmBaseConfig): self.null_convergence = DistCost(self.convergence_cfg.null_space_cfg) # set start state: - start_state = torch.randn((1, self.dynamics_model.d_state), **vars(self.tensor_args)) + start_state = torch.randn( + (1, self.dynamics_model.d_state), **(self.tensor_args.as_torch_dict()) + ) self._start_state = JointState( position=start_state[:, : self.dynamics_model.d_dof], velocity=start_state[:, : self.dynamics_model.d_dof], @@ -366,9 +368,11 @@ class ArmBase(RolloutBase, ArmBaseConfig): ) cost_list.append(coll_cost) if return_list: - return cost_list - cost = cat_sum(cost_list) + if self.sum_horizon: + cost = cat_sum_horizon(cost_list) + else: + cost = cat_sum(cost_list) return cost def constraint_fn( diff --git a/src/curobo/rollout/arm_reacher.py b/src/curobo/rollout/arm_reacher.py index 8074d11..095e89c 100644 --- a/src/curobo/rollout/arm_reacher.py +++ b/src/curobo/rollout/arm_reacher.py @@ -10,7 +10,7 @@ # # Standard Library from dataclasses import dataclass -from typing import Dict, Optional +from typing import Dict, List, Optional # Third Party import torch @@ -29,8 +29,9 @@ from curobo.types.base import TensorDeviceType from curobo.types.robot import RobotConfig from curobo.types.tensor import T_BValue_float, T_BValue_int from curobo.util.helpers import list_idx_if_not_none -from curobo.util.logger import log_error, log_info -from curobo.util.tensor_util import cat_max, cat_sum +from 
curobo.util.logger import log_error, log_info, log_warn +from curobo.util.tensor_util import cat_max +from curobo.util.torch_utils import get_torch_jit_decorator # Local Folder from .arm_base import ArmBase, ArmBaseConfig, ArmCostConfig @@ -145,7 +146,7 @@ class ArmReacherConfig(ArmBaseConfig): ) -@torch.jit.script +@get_torch_jit_decorator() def _compute_g_dist_jit(rot_err_norm, goal_dist): # goal_cost = goal_cost.view(cost.shape) # rot_err_norm = rot_err_norm.view(cost.shape) @@ -319,7 +320,12 @@ class ArmReacher(ArmBase, ArmReacherConfig): g_dist, ) cost_list.append(z_vel) - cost = cat_sum(cost_list) + with profiler.record_function("cat_sum"): + if self.sum_horizon: + cost = cat_sum_horizon_reacher(cost_list) + else: + cost = cat_sum_reacher(cost_list) + return cost def convergence_fn( @@ -466,3 +472,15 @@ class ArmReacher(ArmBase, ArmReacherConfig): ) for x in pose_costs ] + + +@get_torch_jit_decorator() +def cat_sum_reacher(tensor_list: List[torch.Tensor]): + cat_tensor = torch.sum(torch.stack(tensor_list, dim=0), dim=0) + return cat_tensor + + +@get_torch_jit_decorator() +def cat_sum_horizon_reacher(tensor_list: List[torch.Tensor]): + cat_tensor = torch.sum(torch.stack(tensor_list, dim=0), dim=(0, -1)) + return cat_tensor diff --git a/src/curobo/rollout/cost/bound_cost.py b/src/curobo/rollout/cost/bound_cost.py index ebaf8f5..0c70194 100644 --- a/src/curobo/rollout/cost/bound_cost.py +++ b/src/curobo/rollout/cost/bound_cost.py @@ -21,6 +21,7 @@ import warp as wp from curobo.cuda_robot_model.types import JointLimits from curobo.types.robot import JointState from curobo.types.tensor import T_DOF +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.warp import init_warp # Local Folder @@ -267,7 +268,7 @@ class BoundCost(CostBase, BoundCostConfig): return super().update_dt(dt) -@torch.jit.script +@get_torch_jit_decorator() def forward_bound_cost(p, lower_bounds, upper_bounds, weight): # c = weight * torch.sum(torch.nn.functional.relu(torch.max(lower_bounds - p, p - upper_bounds)), dim=-1) @@ -281,7 +282,7 @@ def forward_bound_cost(p, lower_bounds, upper_bounds, weight): return c -@torch.jit.script +@get_torch_jit_decorator() def forward_all_bound_cost( p, v, diff --git a/src/curobo/rollout/cost/dist_cost.py b/src/curobo/rollout/cost/dist_cost.py index 3c387ad..58d9f4c 100644 --- a/src/curobo/rollout/cost/dist_cost.py +++ b/src/curobo/rollout/cost/dist_cost.py @@ -18,6 +18,7 @@ import torch import warp as wp # CuRobo +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.warp import init_warp # Local Folder @@ -41,32 +42,32 @@ class DistCostConfig(CostConfig): return super().__post_init__() -@torch.jit.script +@get_torch_jit_decorator() def L2_DistCost_jit(vec_weight, disp_vec): return torch.norm(vec_weight * disp_vec, p=2, dim=-1, keepdim=False) -@torch.jit.script +@get_torch_jit_decorator() def fwd_SQL2_DistCost_jit(vec_weight, disp_vec): return torch.sum(torch.square(vec_weight * disp_vec), dim=-1, keepdim=False) -@torch.jit.script +@get_torch_jit_decorator() def fwd_L1_DistCost_jit(vec_weight, disp_vec): return torch.sum(torch.abs(vec_weight * disp_vec), dim=-1, keepdim=False) -@torch.jit.script +@get_torch_jit_decorator() def L2_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): return torch.norm(weight * vec_weight * (g_vec - c_vec), p=2, dim=-1, keepdim=False) -@torch.jit.script +@get_torch_jit_decorator() def fwd_SQL2_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): return torch.sum(torch.square(weight * vec_weight * (g_vec 
- c_vec)), dim=-1, keepdim=False) -@torch.jit.script +@get_torch_jit_decorator() def fwd_L1_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): return torch.sum(torch.abs(weight * vec_weight * (g_vec - c_vec)), dim=-1, keepdim=False) diff --git a/src/curobo/rollout/cost/primitive_collision_cost.py b/src/curobo/rollout/cost/primitive_collision_cost.py index 7f9f6f2..a840114 100644 --- a/src/curobo/rollout/cost/primitive_collision_cost.py +++ b/src/curobo/rollout/cost/primitive_collision_cost.py @@ -19,6 +19,8 @@ import torch from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollision from curobo.rollout.cost.cost_base import CostBase, CostConfig from curobo.rollout.dynamics_model.integration_utils import interpolate_kernel, sum_matrix +from curobo.util.logger import log_info +from curobo.util.torch_utils import get_torch_jit_decorator @dataclass @@ -48,7 +50,11 @@ class PrimitiveCollisionCostConfig(CostConfig): #: post optimization interpolation to not hit any obstacles. activation_distance: Union[torch.Tensor, float] = 0.0 + #: Setting this flag to true will sum the distance colliding obstacles. + sum_collisions: bool = True + #: Setting this flag to true will sum the distance across spheres of the robot. + #: Setting to False will only take the max distance sum_distance: bool = True def __post_init__(self): @@ -103,6 +109,9 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): self._collision_query_buffer.update_buffer_shape( robot_spheres_in.shape, self.tensor_args, self.world_coll_checker.collision_types ) + if not self.sum_distance: + log_info("sum_distance=False will be slower than sum_distance=True") + self.return_loss = True dist = self.sweep_check_fn( robot_spheres_in, self._collision_query_buffer, @@ -115,9 +124,9 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): return_loss=self.return_loss, ) if self.classify: - cost = weight_collision(dist, self.weight, self.sum_distance) + cost = weight_collision(dist, self.sum_distance) else: - cost = weight_distance(dist, self.weight, self.sum_distance) + cost = weight_distance(dist, self.sum_distance) return cost def sweep_fn(self, robot_spheres_in, env_query_idx: Optional[torch.Tensor] = None): @@ -140,6 +149,9 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): self._collision_query_buffer.update_buffer_shape( sampled_spheres.shape, self.tensor_args, self.world_coll_checker.collision_types ) + if not self.sum_distance: + log_info("sum_distance=False will be slower than sum_distance=True") + self.return_loss = True dist = self.coll_check_fn( sampled_spheres.contiguous(), self._collision_query_buffer, @@ -151,9 +163,9 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): dist = dist.view(batch_size, new_horizon, n_spheres) if self.classify: - cost = weight_sweep_collision(self.int_sum_mat, dist, self.weight, self.sum_distance) + cost = weight_sweep_collision(self.int_sum_mat, dist, self.sum_distance) else: - cost = weight_sweep_distance(self.int_sum_mat, dist, self.weight, self.sum_distance) + cost = weight_sweep_distance(self.int_sum_mat, dist, self.sum_distance) return cost @@ -161,6 +173,9 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): self._collision_query_buffer.update_buffer_shape( robot_spheres_in.shape, self.tensor_args, self.world_coll_checker.collision_types ) + if not self.sum_distance: + log_info("sum_distance=False will be slower than sum_distance=True") + self.return_loss = True dist = self.coll_check_fn( 
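# Illustration only (not part of the patch): the reworked weight_distance /
# weight_collision helpers below drop the unused weight argument and reduce the
# per-sphere distances either by summing (sum_cost=True, the default) or by
# taking the worst sphere (sum_cost=False). For a distance tensor of assumed
# shape (batch, horizon, n_spheres), the two reductions would look roughly like:
#
#     summed = torch.sum(dist, dim=-1)    # sum_distance=True
#     worst = torch.max(dist, dim=-1)[0]  # sum_distance=False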
robot_spheres_in, self._collision_query_buffer, @@ -168,12 +183,13 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): env_query_idx=env_query_idx, activation_distance=self.activation_distance, return_loss=self.return_loss, + sum_collisions=self.sum_collisions, ) if self.classify: - cost = weight_collision(dist, self.weight, self.sum_distance) + cost = weight_collision(dist, self.sum_distance) else: - cost = weight_distance(dist, self.weight, self.sum_distance) + cost = weight_distance(dist, self.sum_distance) return cost def update_dt(self, dt: Union[float, torch.Tensor]): @@ -184,31 +200,43 @@ class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig): return self._collision_query_buffer.get_gradient_buffer() -@torch.jit.script -def weight_sweep_distance(int_mat, dist, weight, sum_cost: bool): - dist = torch.sum(dist, dim=-1) +@get_torch_jit_decorator() +def weight_sweep_distance(int_mat, dist, sum_cost: bool): + if sum_cost: + dist = torch.sum(dist, dim=-1) + else: + dist = torch.max(dist, dim=-1)[0] dist = dist @ int_mat return dist -@torch.jit.script -def weight_sweep_collision(int_mat, dist, weight, sum_cost: bool): - dist = torch.sum(dist, dim=-1) +@get_torch_jit_decorator() +def weight_sweep_collision(int_mat, dist, sum_cost: bool): + if sum_cost: + dist = torch.sum(dist, dim=-1) + else: + dist = torch.max(dist, dim=-1)[0] + dist = torch.where(dist > 0, dist + 1.0, dist) dist = dist @ int_mat return dist -@torch.jit.script -def weight_distance(dist, weight, sum_cost: bool): +@get_torch_jit_decorator() +def weight_distance(dist, sum_cost: bool): if sum_cost: dist = torch.sum(dist, dim=-1) + else: + dist = torch.max(dist, dim=-1)[0] return dist -@torch.jit.script -def weight_collision(dist, weight, sum_cost: bool): +@get_torch_jit_decorator() +def weight_collision(dist, sum_cost: bool): if sum_cost: dist = torch.sum(dist, dim=-1) + else: + dist = torch.max(dist, dim=-1)[0] + dist = torch.where(dist > 0, dist + 1.0, dist) return dist diff --git a/src/curobo/rollout/cost/stop_cost.py b/src/curobo/rollout/cost/stop_cost.py index 9a1cbbb..277444e 100644 --- a/src/curobo/rollout/cost/stop_cost.py +++ b/src/curobo/rollout/cost/stop_cost.py @@ -17,6 +17,7 @@ import torch # CuRobo from curobo.rollout.dynamics_model.kinematic_model import TimeTrajConfig +from curobo.util.torch_utils import get_torch_jit_decorator # Local Folder from .cost_base import CostBase, CostConfig @@ -72,7 +73,7 @@ class StopCost(CostBase, StopCostConfig): return cost -@torch.jit.script +@get_torch_jit_decorator() def velocity_cost(vels, weight, max_vel): vel_abs = torch.abs(vels) vel_abs = torch.nn.functional.relu(vel_abs - max_vel[: vels.shape[1]]) diff --git a/src/curobo/rollout/cost/straight_line_cost.py b/src/curobo/rollout/cost/straight_line_cost.py index 15abe17..870f4ab 100644 --- a/src/curobo/rollout/cost/straight_line_cost.py +++ b/src/curobo/rollout/cost/straight_line_cost.py @@ -13,11 +13,14 @@ # Third Party import torch +# CuRobo +from curobo.util.torch_utils import get_torch_jit_decorator + # Local Folder from .cost_base import CostBase, CostConfig -@torch.jit.script +@get_torch_jit_decorator() def st_cost(ee_pos_batch, vec_weight, weight): ee_plus_one = torch.roll(ee_pos_batch, 1, dims=1) diff --git a/src/curobo/rollout/cost/zero_cost.py b/src/curobo/rollout/cost/zero_cost.py index c122853..c370566 100644 --- a/src/curobo/rollout/cost/zero_cost.py +++ b/src/curobo/rollout/cost/zero_cost.py @@ -11,11 +11,14 @@ # Third Party import torch +# CuRobo +from 
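The swept variants reduce over spheres the same way and then right-multiply by `self.int_sum_mat`, which `sum_matrix` (further down in this patch) builds as a `((h - 1) * int_steps, h)` block matrix of ones, folding per-substep costs back onto the original horizon. A numeric sketch, assuming `h=3` knot points and `int_steps=2` interpolated substeps:

    import torch

    def sum_matrix(h, int_steps, device="cpu", dtype=torch.float32):
        # mirrors the sum_matrix helper in integration_utils, shown later in this patch
        sum_mat = torch.zeros(((h - 1) * int_steps, h), device=device, dtype=dtype)
        for i in range(h - 1):
            sum_mat[i * int_steps : (i + 1) * int_steps, i] = 1.0
        return sum_mat

    int_mat = sum_matrix(h=3, int_steps=2)       # shape [4, 3]
    dist = torch.tensor([[1.0, 2.0, 3.0, 4.0]])  # per-substep cost after the sphere reduction
    print(dist @ int_mat)                        # tensor([[3., 7., 0.]])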
curobo.util.torch_utils import get_torch_jit_decorator + # Local Folder from .cost_base import CostBase -@torch.jit.script +@get_torch_jit_decorator() def squared_sum(cost: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: # return weight * torch.square(torch.linalg.norm(cost, dim=-1, ord=1)) # return weight * torch.sum(torch.square(cost), dim=-1) @@ -24,7 +27,7 @@ def squared_sum(cost: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: return torch.sum(torch.square(cost) * weight, dim=-1) -@torch.jit.script +@get_torch_jit_decorator() def run_squared_sum( cost: torch.Tensor, weight: torch.Tensor, run_weight: torch.Tensor ) -> torch.Tensor: @@ -35,13 +38,13 @@ def run_squared_sum( # return torch.sum(torch.square(cost), dim=-1) * weight * run_weight -@torch.jit.script +@get_torch_jit_decorator() def backward_squared_sum(cost_vec, w): return 2.0 * w * cost_vec # * g_out.unsqueeze(-1) # return w * g_out.unsqueeze(-1) -@torch.jit.script +@get_torch_jit_decorator() def backward_run_squared_sum(cost_vec, w, r_w): return 2.0 * w * r_w.unsqueeze(-1) * cost_vec # * g_out.unsqueeze(-1) # return w * r_w.unsqueeze(-1) * cost_vec * g_out.unsqueeze(-1) diff --git a/src/curobo/rollout/dynamics_model/integration_utils.py b/src/curobo/rollout/dynamics_model/integration_utils.py index 143ac17..fb9d434 100644 --- a/src/curobo/rollout/dynamics_model/integration_utils.py +++ b/src/curobo/rollout/dynamics_model/integration_utils.py @@ -25,6 +25,7 @@ from curobo.curobolib.tensor_step import ( ) from curobo.types.base import TensorDeviceType from curobo.types.robot import JointState +from curobo.util.torch_utils import get_torch_jit_decorator def build_clique_matrix(horizon, dt, device="cpu", dtype=torch.float32): @@ -154,7 +155,7 @@ def build_start_state_mask(horizon, tensor_args: TensorDeviceType): return mask, n_mask -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_jerk(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix=None): # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Optional[Tensor]) -> Tensor @@ -176,7 +177,7 @@ def tensor_step_jerk(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_m return state_seq -# @torch.jit.script +# @get_torch_jit_decorator() def euler_integrate(q_0, u, diag_dt, integrate_matrix): # q_new = q_0 + torch.matmul(integrate_matrix, torch.matmul(diag_dt, u)) q_new = q_0 + torch.matmul(integrate_matrix, u) @@ -184,7 +185,7 @@ def euler_integrate(q_0, u, diag_dt, integrate_matrix): return q_new -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_acc(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix=None): # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Optional[Tensor]) -> Tensor # This is batch,n_dof @@ -207,7 +208,7 @@ def tensor_step_acc(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_ma return state_seq -@torch.jit.script +@get_torch_jit_decorator() def jit_tensor_step_pos_clique_contiguous(pos_act, start_position, mask, n_mask, fd_1, fd_2, fd_3): state_position = (start_position.unsqueeze(1).transpose(1, 2) @ mask.transpose(0, 1)) + ( pos_act.transpose(1, 2) @ n_mask.transpose(0, 1) @@ -222,7 +223,7 @@ def jit_tensor_step_pos_clique_contiguous(pos_act, start_position, mask, n_mask, return state_position, state_vel, state_acc, state_jerk -@torch.jit.script +@get_torch_jit_decorator() def jit_tensor_step_pos_clique(pos_act, start_position, mask, n_mask, fd_1, fd_2, fd_3): state_position = mask @ start_position.unsqueeze(1) + n_mask @ pos_act state_vel = fd_1 @ state_position @@ -231,7 
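The zero-cost helpers pair each forward term with a hand-written backward; for `squared_sum`, the gradient of `sum_j(w_j * c_j^2)` with respect to `c` is `2 * w * c`, which is what `backward_squared_sum` returns before the upstream gradient is applied. A quick autograd cross-check of that identity:

    import torch

    def squared_sum(cost, weight):
        return torch.sum(torch.square(cost) * weight, dim=-1)

    def backward_squared_sum(cost_vec, w):
        return 2.0 * w * cost_vec  # multiplied by the incoming gradient inside the autograd Function

    cost = torch.randn(2, 3, requires_grad=True)
    weight = torch.tensor([1.0, 0.5, 2.0])
    squared_sum(cost, weight).sum().backward()
    assert torch.allclose(cost.grad, backward_squared_sum(cost.detach(), weight))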
+232,7 @@ def jit_tensor_step_pos_clique(pos_act, start_position, mask, n_mask, fd_1, fd_2 return state_position, state_vel, state_acc, state_jerk -@torch.jit.script +@get_torch_jit_decorator() def jit_backward_pos_clique(grad_p, grad_v, grad_a, grad_j, n_mask, fd_1, fd_2, fd_3): p_grad = ( grad_p @@ -247,7 +248,7 @@ def jit_backward_pos_clique(grad_p, grad_v, grad_a, grad_j, n_mask, fd_1, fd_2, return u_grad -@torch.jit.script +@get_torch_jit_decorator() def jit_backward_pos_clique_contiguous(grad_p, grad_v, grad_a, grad_j, n_mask, fd_1, fd_2, fd_3): p_grad = grad_p + ( grad_j.transpose(-1, -2) @ fd_3 @@ -532,7 +533,7 @@ class CliqueTensorStepIdxCentralDifferenceKernel(torch.autograd.Function): start_position, start_velocity, start_acceleration, - start_idx, + start_idx.contiguous(), traj_dt, out_position.shape[0], out_position.shape[1], @@ -750,7 +751,7 @@ class AccelerationTensorStepIdxKernel(torch.autograd.Function): return u_grad, None, None, None, None, None, None, None, None, None, None -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_pos_clique( state: JointState, act: torch.Tensor, @@ -786,7 +787,7 @@ def step_acc_semi_euler(state, act, diag_dt, n_dofs, integrate_matrix): return state_seq -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_acc_semi_euler( state, act, state_seq, diag_dt, integrate_matrix, integrate_matrix_pos ): @@ -806,7 +807,7 @@ def tensor_step_acc_semi_euler( return state_seq -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_vel(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix): # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Tensor) -> Tensor @@ -830,7 +831,7 @@ def tensor_step_vel(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_ma return state_seq -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_pos(state, act, state_seq, fd_matrix): # This is batch,n_dof state_seq.position[:, 0, :] = state.position @@ -850,7 +851,7 @@ def tensor_step_pos(state, act, state_seq, fd_matrix): return state_seq -# @torch.jit.script +# @get_torch_jit_decorator() def tensor_step_pos_ik(act, state_seq): state_seq.position = act return state_seq @@ -869,7 +870,7 @@ def tensor_linspace(start_tensor, end_tensor, steps=10): def sum_matrix(h, int_steps, tensor_args): - sum_mat = torch.zeros(((h - 1) * int_steps, h), **vars(tensor_args)) + sum_mat = torch.zeros(((h - 1) * int_steps, h), **(tensor_args.as_torch_dict())) for i in range(h - 1): sum_mat[i * int_steps : i * int_steps + int_steps, i] = 1.0 # hack: diff --git a/src/curobo/rollout/dynamics_model/tensor_step.py b/src/curobo/rollout/dynamics_model/tensor_step.py index cccc049..c28b06b 100644 --- a/src/curobo/rollout/dynamics_model/tensor_step.py +++ b/src/curobo/rollout/dynamics_model/tensor_step.py @@ -19,6 +19,7 @@ import torch # CuRobo from curobo.types.base import TensorDeviceType from curobo.types.robot import JointState +from curobo.util.torch_utils import get_torch_jit_decorator # Local Folder from .integration_utils import ( @@ -544,7 +545,7 @@ class TensorStepPositionCliqueKernel(TensorStepBase): return new_signal -@torch.jit.script +@get_torch_jit_decorator(force_jit=True) def filter_signal_jit(signal, kernel): b, h, dof = signal.shape diff --git a/src/curobo/rollout/rollout_base.py b/src/curobo/rollout/rollout_base.py index 31cdf9f..2b40cc1 100644 --- a/src/curobo/rollout/rollout_base.py +++ b/src/curobo/rollout/rollout_base.py @@ -36,6 +36,7 @@ from curobo.util.helpers import list_idx_if_not_none from curobo.util.logger 
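The clique step above expresses the whole rollout as matrix products: a start-state mask and an action mask assemble the position sequence, and fixed finite-difference matrices then produce velocity, acceleration, and jerk. The sketch below uses a simple first-order backward-difference matrix for illustration only; the actual masks and (central) difference matrices come from `build_start_state_mask` and the surrounding setup, so treat these operators as stand-ins:

    import torch

    h, dof, dt = 5, 2, 0.02
    pos = torch.cumsum(torch.rand(1, h, dof), dim=1)  # stand-in for mask @ start + n_mask @ act

    # vel[t] = (pos[t] - pos[t-1]) / dt; the first row is only a boundary placeholder here
    fd_1 = (torch.eye(h) - torch.diag(torch.ones(h - 1), diagonal=-1)) / dt
    vel = fd_1 @ pos    # [1, h, dof]
    acc = fd_1 @ vel    # chaining the operator yields higher derivatives
    jerk = fd_1 @ acc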
import log_info from curobo.util.sample_lib import HaltonGenerator from curobo.util.tensor_util import copy_tensor +from curobo.util.torch_utils import get_torch_jit_decorator @dataclass @@ -298,9 +299,9 @@ class Goal(Sequence): if self.goal_pose is not None: self.goal_pose = self.goal_pose.to(tensor_args) if self.goal_state is not None: - self.goal_state = self.goal_state.to(**vars(tensor_args)) + self.goal_state = self.goal_state.to(**(tensor_args.as_torch_dict())) if self.current_state is not None: - self.current_state = self.current_state.to(**vars(tensor_args)) + self.current_state = self.current_state.to(**(tensor_args.as_torch_dict())) return self def copy_(self, goal: Goal, update_idx_buffers: bool = True): @@ -350,6 +351,7 @@ class Goal(Sequence): if ref_buffer is not None: ref_buffer = ref_buffer.copy_(buffer) else: + log_info("breaking reference") ref_buffer = buffer.clone() return ref_buffer @@ -414,6 +416,7 @@ class Goal(Sequence): @dataclass class RolloutConfig: tensor_args: TensorDeviceType + sum_horizon: bool = False class RolloutBase: @@ -578,7 +581,7 @@ class RolloutBase: return q_js -@torch.jit.script +@get_torch_jit_decorator() def tensor_repeat_seeds(tensor, num_seeds: int): a = ( tensor.view(tensor.shape[0], 1, tensor.shape[-1]) diff --git a/src/curobo/types/base.py b/src/curobo/types/base.py index 54bb125..6669c62 100644 --- a/src/curobo/types/base.py +++ b/src/curobo/types/base.py @@ -19,7 +19,10 @@ import torch @dataclass(frozen=True) class TensorDeviceType: device: torch.device = torch.device("cuda", 0) - dtype: torch.float32 = torch.float32 + dtype: torch.dtype = torch.float32 + collision_geometry_dtype: torch.dtype = torch.float32 + collision_gradient_dtype: torch.dtype = torch.float32 + collision_distance_dtype: torch.dtype = torch.float32 @staticmethod def from_basic(device: str, dev_id: int): @@ -36,3 +39,6 @@ class TensorDeviceType: def cpu(self): return TensorDeviceType(device=torch.device("cpu"), dtype=self.dtype) + + def as_torch_dict(self): + return {"device": self.device, "dtype": self.dtype} diff --git a/src/curobo/types/camera.py b/src/curobo/types/camera.py index 0febe84..05c5ea2 100644 --- a/src/curobo/types/camera.py +++ b/src/curobo/types/camera.py @@ -39,7 +39,7 @@ class CameraObservation: resolution: Optional[List[int]] = None pose: Optional[Pose] = None intrinsics: Optional[torch.Tensor] = None - timestamp: float = 0.0 + timestamp: Optional[torch.Tensor] = None def filter_depth(self, distance: float = 0.01): self.depth_image = torch.where(self.depth_image < distance, 0, self.depth_image) @@ -62,6 +62,8 @@ class CameraObservation: self.projection_rays.copy_(new_data.projection_rays) if self.pose is not None: self.pose.copy_(new_data.pose) + if self.timestamp is not None: + self.timestamp.copy_(new_data.timestamp) self.resolution = new_data.resolution @record_function("camera/clone") @@ -73,13 +75,41 @@ class CameraObservation: intrinsics=self.intrinsics.clone() if self.intrinsics is not None else None, resolution=self.resolution, pose=self.pose.clone() if self.pose is not None else None, + timestamp=self.timestamp.clone() if self.timestamp is not None else None, + image_segmentation=( + self.image_segmentation.clone() if self.image_segmentation is not None else None + ), + projection_matrix=( + self.projection_matrix.clone() if self.projection_matrix is not None else None + ), + projection_rays=( + self.projection_rays.clone() if self.projection_rays is not None else None + ), + name=self.name, ) def to(self, device: torch.device): - if 
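`TensorDeviceType` now carries separate collision dtypes, so splatting `vars(tensor_args)` into torch factory calls would pass unknown keyword arguments; the new `as_torch_dict()` exposes only `device` and `dtype`, which is why call sites throughout this patch switch over. A minimal sketch of the difference (mirroring the dataclass above, on CPU so it runs anywhere):

    from dataclasses import dataclass
    import torch

    @dataclass(frozen=True)
    class TensorDeviceType:
        device: torch.device = torch.device("cpu")
        dtype: torch.dtype = torch.float32
        collision_geometry_dtype: torch.dtype = torch.float32

        def as_torch_dict(self):
            return {"device": self.device, "dtype": self.dtype}

    args = TensorDeviceType()
    x = torch.zeros(3, **args.as_torch_dict())  # ok
    # torch.zeros(3, **vars(args))  # would fail: extra collision dtype fields are not torch kwargs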
self.rgb_image is not None: - self.rgb_image = self.rgb_image.to(device=device) - if self.depth_image is not None: - self.depth_image = self.depth_image.to(device=device) + + self.rgb_image = self.rgb_image.to(device=device) if self.rgb_image is not None else None + self.depth_image = ( + self.depth_image.to(device=device) if self.depth_image is not None else None + ) + + self.image_segmentation = ( + self.image_segmentation.to(device=device) + if self.image_segmentation is not None + else None + ) + self.projection_matrix = ( + self.projection_matrix.to(device=device) if self.projection_matrix is not None else None + ) + self.projection_rays = ( + self.projection_rays.to(device=device) if self.projection_rays is not None else None + ) + self.intrinsics = self.intrinsics.to(device=device) if self.intrinsics is not None else None + self.timestamp = self.timestamp.to(device=device) if self.timestamp is not None else None + self.pose = self.pose.to(device=device) if self.pose is not None else None + return self def get_pointcloud(self): @@ -114,3 +144,56 @@ class CameraObservation: self.projection_rays = project_rays self.projection_rays.copy_(project_rays) + + def stack(self, new_observation: CameraObservation, dim: int = 0): + rgb_image = ( + torch.stack((self.rgb_image, new_observation.rgb_image), dim=dim) + if self.rgb_image is not None + else None + ) + depth_image = ( + torch.stack((self.depth_image, new_observation.depth_image), dim=dim) + if self.depth_image is not None + else None + ) + image_segmentation = ( + torch.stack((self.image_segmentation, new_observation.image_segmentation), dim=dim) + if self.image_segmentation is not None + else None + ) + projection_matrix = ( + torch.stack((self.projection_matrix, new_observation.projection_matrix), dim=dim) + if self.projection_matrix is not None + else None + ) + projection_rays = ( + torch.stack((self.projection_rays, new_observation.projection_rays), dim=dim) + if self.projection_rays is not None + else None + ) + resolution = self.resolution + + pose = self.pose.stack(new_observation.pose) if self.pose is not None else None + + intrinsics = ( + torch.stack((self.intrinsics, new_observation.intrinsics), dim=dim) + if self.intrinsics is not None + else None + ) + timestamp = ( + torch.stack((self.timestamp, new_observation.timestamp), dim=dim) + if self.timestamp is not None + else None + ) + return CameraObservation( + name=self.name, + rgb_image=rgb_image, + depth_image=depth_image, + image_segmentation=image_segmentation, + projection_matrix=projection_matrix, + projection_rays=projection_rays, + resolution=resolution, + pose=pose, + intrinsics=intrinsics, + timestamp=timestamp, + ) diff --git a/src/curobo/types/math.py b/src/curobo/types/math.py index 7311fb2..cd624c9 100644 --- a/src/curobo/types/math.py +++ b/src/curobo/types/math.py @@ -12,7 +12,7 @@ from __future__ import annotations # Standard Library from dataclasses import dataclass -from typing import List, Optional, Sequence +from typing import List, Optional, Sequence, Union # Third Party import numpy as np @@ -35,6 +35,7 @@ from curobo.types.base import TensorDeviceType from curobo.util.helpers import list_idx_if_not_none from curobo.util.logger import log_error, log_info, log_warn from curobo.util.tensor_util import clone_if_not_none, copy_tensor +from curobo.util.torch_utils import get_torch_jit_decorator # Local Folder from .tensor import T_BPosition, T_BQuaternion, T_BRotation @@ -263,8 +264,17 @@ class Pose(Sequence): # rotation=clone_if_not_none(self.rotation), ) 
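`CameraObservation` also gains a `stack()` that batches two observations field by field (tensors are stacked, `resolution` and `name` are carried over), and `to()` now moves every populated field and returns `self`. A short usage sketch, assuming the dataclass accepts these fields as keyword arguments as elsewhere in the repository:

    import torch
    from curobo.types.camera import CameraObservation

    cam_0 = CameraObservation(depth_image=torch.rand(480, 640), intrinsics=torch.eye(3))
    cam_1 = CameraObservation(depth_image=torch.rand(480, 640), intrinsics=torch.eye(3))

    batched = cam_0.stack(cam_1, dim=0)                # depth_image -> [2, 480, 640]
    batched = batched.to(device=torch.device("cuda"))  # chained, since to() returns self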
- def to(self, tensor_args: TensorDeviceType): - t_type = vars(tensor_args) + def to( + self, + tensor_args: Optional[TensorDeviceType] = None, + device: Optional[torch.device] = None, + ): + if tensor_args is None and device is None: + log_error("Pose.to() requires tensor_args or device") + if tensor_args is not None: + t_type = vars(tensor_args.as_torch_dict()) + else: + t_type = {"device": device} if self.position is not None: self.position = self.position.to(**t_type) if self.quaternion is not None: @@ -338,7 +348,7 @@ class Pose(Sequence): return p_distance, quat_distance def angular_distance(self, other_pose: Pose, use_phi3: bool = False): - """This function computes the angular distance \phi_3. + """This function computes the angular distance phi_3. See Huynh, Du Q. "Metrics for 3D rotations: Comparison and analysis." Journal of Mathematical Imaging and Vision 35 (2009): 155-164. @@ -461,9 +471,9 @@ def quat_multiply(q1, q2, q_res): return q_res -@torch.jit.script +@get_torch_jit_decorator() def angular_distance_phi3(goal_quat, current_quat): - """This function computes the angular distance \phi_3. + """This function computes the angular distance phi_3. See Huynh, Du Q. "Metrics for 3D rotations: Comparison and analysis." Journal of Mathematical Imaging and Vision 35 (2009): 155-164. @@ -524,7 +534,7 @@ class OrientationError(Function): return None, grad_mul, None -@torch.jit.script +@get_torch_jit_decorator() def normalize_quaternion(in_quaternion: torch.Tensor) -> torch.Tensor: k = torch.sign(in_quaternion[..., 0:1]) # NOTE: torch sign returns 0 as sign value when value is 0.0 diff --git a/src/curobo/types/state.py b/src/curobo/types/state.py index 78e3f6f..6faf75f 100644 --- a/src/curobo/types/state.py +++ b/src/curobo/types/state.py @@ -30,6 +30,7 @@ from curobo.util.tensor_util import ( fd_tensor, tensor_repeat_seeds, ) +from curobo.util.torch_utils import get_torch_jit_decorator @dataclass @@ -211,10 +212,10 @@ class JointState(State): j = None v = a = None max_idx = 0 + if isinstance(idx, List): + idx = torch.as_tensor(idx, device=self.position.device, dtype=torch.long) if isinstance(idx, int): max_idx = idx - elif isinstance(idx, List): - max_idx = max(idx) elif isinstance(idx, torch.Tensor): max_idx = torch.max(idx) if max_idx >= self.position.shape[0]: @@ -223,31 +224,19 @@ class JointState(State): + " index out of range, current state is of length " + str(self.position.shape) ) - p = self.position[idx] - if self.velocity is not None: - if max_idx >= self.velocity.shape[0]: - raise ValueError( - str(max_idx) - + " index out of range, current velocity is of length " - + str(self.velocity.shape) - ) - v = self.velocity[idx] - if self.acceleration is not None: - if max_idx >= self.acceleration.shape[0]: - raise ValueError( - str(max_idx) - + " index out of range, current acceleration is of length " - + str(self.acceleration.shape) - ) - a = self.acceleration[idx] - if self.jerk is not None: - if max_idx >= self.jerk.shape[0]: - raise ValueError( - str(max_idx) - + " index out of range, current jerk is of length " - + str(self.jerk.shape) - ) - j = self.jerk[idx] + if isinstance(idx, int): + p, v, a, j = jit_get_index_int( + self.position, self.velocity, self.acceleration, self.jerk, idx + ) + elif isinstance(idx, torch.Tensor): + p, v, a, j = jit_get_index( + self.position, self.velocity, self.acceleration, self.jerk, idx + ) + else: + p, v, a, j = fn_get_index( + self.position, self.velocity, self.acceleration, self.jerk, idx + ) + return JointState(p, v, a, 
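The reworked `Pose.to()` accepts either a `TensorDeviceType` or a bare `device`. Note that `as_torch_dict()` already returns a plain dict, so wrapping it in `vars()` as written above would raise a `TypeError` at runtime; the presumably intended kwargs construction is:

    # sketch of the intended branch inside Pose.to()
    if tensor_args is not None:
        t_type = tensor_args.as_torch_dict()  # already a dict; vars() on a dict raises TypeError
    else:
        t_type = {"device": device}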
joint_names=self.joint_names, jerk=j) def __len__(self): @@ -514,6 +503,88 @@ class JointState(State): jerk = self.jerk * (dt**3) return JointState(self.position, vel, acc, self.joint_names, jerk, self.tensor_args) + def scale_by_dt(self, dt: torch.Tensor, new_dt: torch.Tensor): + vel, acc, jerk = jit_js_scale(self.velocity, self.acceleration, self.jerk, dt, new_dt) + + return JointState(self.position, vel, acc, self.joint_names, jerk, self.tensor_args) + @property def shape(self): return self.position.shape + + +@get_torch_jit_decorator() +def jit_js_scale( + vel: Union[None, torch.Tensor], + acc: Union[None, torch.Tensor], + jerk: Union[None, torch.Tensor], + dt: torch.Tensor, + new_dt: torch.Tensor, +): + scale_dt = dt / new_dt + if vel is not None: + vel = vel * scale_dt + if acc is not None: + acc = acc * scale_dt * scale_dt + if jerk is not None: + jerk = jerk * scale_dt * scale_dt * scale_dt + return vel, acc, jerk + + +@get_torch_jit_decorator() +def jit_get_index( + position: torch.Tensor, + velocity: Union[torch.Tensor, None], + acc: Union[torch.Tensor, None], + jerk: Union[torch.Tensor, None], + idx: torch.Tensor, +): + + position = position[idx] + if velocity is not None: + velocity = velocity[idx] + if acc is not None: + acc = acc[idx] + if jerk is not None: + jerk = jerk[idx] + + return position, velocity, acc, jerk + + +def fn_get_index( + position: torch.Tensor, + velocity: Union[torch.Tensor, None], + acc: Union[torch.Tensor, None], + jerk: Union[torch.Tensor, None], + idx: torch.Tensor, +): + + position = position[idx] + if velocity is not None: + velocity = velocity[idx] + if acc is not None: + acc = acc[idx] + if jerk is not None: + jerk = jerk[idx] + + return position, velocity, acc, jerk + + +@get_torch_jit_decorator() +def jit_get_index_int( + position: torch.Tensor, + velocity: Union[torch.Tensor, None], + acc: Union[torch.Tensor, None], + jerk: Union[torch.Tensor, None], + idx: int, +): + + position = position[idx] + if velocity is not None: + velocity = velocity[idx] + if acc is not None: + acc = acc[idx] + if jerk is not None: + jerk = jerk[idx] + + return position, velocity, acc, jerk diff --git a/src/curobo/types/tensor.py b/src/curobo/types/tensor.py index ee3da60..cc220a7 100644 --- a/src/curobo/types/tensor.py +++ b/src/curobo/types/tensor.py @@ -9,54 +9,24 @@ # its affiliates is strictly prohibited. 
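`scale_by_dt` retimes a trajectory differentiated with time step `dt` to a new step `new_dt`: positions are untouched while velocity, acceleration, and jerk pick up factors of `(dt/new_dt)`, its square, and its cube. A standalone sketch of the scaling rule (plain tensors rather than a full `JointState`):

    import torch

    def scale_by_dt(vel, acc, jerk, dt, new_dt):
        s = dt / new_dt  # stretching time by 2x halves velocity, quarters acceleration, ...
        return vel * s, acc * s * s, jerk * s * s * s

    vel, acc, jerk = torch.ones(3), torch.ones(3), torch.ones(3)
    dt, new_dt = torch.tensor(0.02), torch.tensor(0.04)
    print(scale_by_dt(vel, acc, jerk, dt, new_dt))
    # (tensor([0.5000, 0.5000, 0.5000]), tensor([0.2500, 0.2500, 0.2500]), tensor([0.1250, 0.1250, 0.1250]))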
# +""" This module contains aliases for structured Tensors, improving readability.""" + +# Third Party +import torch + # CuRobo from curobo.util.logger import log_warn -try: - # Third Party - from torchtyping import TensorType -except ImportError: - log_warn("torchtyping could not be imported, falling back to basic types") - TensorType = None - # Third Party - import torch -b_dof = ["batch", "dof"] -b_value = ["batch", "value"] -bh_value = ["batch", "horizon", "value"] -bh_dof = ["batch", "horizon", "dof"] -h_dof = ["horizon", "dof"] +T_DOF = torch.Tensor #: Tensor of shape [degrees of freedom] +T_BDOF = torch.Tensor #: Tensor of shape [batch, degrees of freedom] +T_BHDOF_float = torch.Tensor #: Tensor of shape [batch, horizon, degrees of freedom] +T_HDOF_float = torch.Tensor #: Tensor of shape [horizon, degrees of freedom] -if TensorType is not None: - T_DOF = TensorType[tuple(["dof"] + [float])] - T_BDOF = TensorType[tuple(b_dof + [float])] - T_BValue_float = TensorType[tuple(b_value + [float])] - T_BHValue_float = TensorType[tuple(bh_value + [float])] - T_BValue_bool = TensorType[tuple(b_value + [bool])] - T_BValue_int = TensorType[tuple(b_value + [int])] +T_BValue_float = torch.Tensor #: Float Tensor of shape [batch, 1]. +T_BHValue_float = torch.Tensor #: Float Tensor of shape [batch, horizon, 1]. +T_BValue_bool = torch.Tensor #: Bool Tensor of shape [batch, horizon, 1]. +T_BValue_int = torch.Tensor #: Int Tensor of shape [batch, horizon, 1]. - T_BPosition = TensorType["batch", "xyz":3, float] - T_BQuaternion = TensorType["batch", "wxyz":4, float] - T_BRotation = TensorType["batch", 3, 3, float] - T_Position = TensorType["xyz":3, float] - T_Quaternion = TensorType["wxyz":4, float] - T_Rotation = TensorType[3, 3, float] - - T_BHDOF_float = TensorType[tuple(bh_dof + [float])] - T_HDOF_float = TensorType[tuple(h_dof + [float])] -else: - T_DOF = torch.Tensor - T_BDOF = torch.Tensor - T_BValue_float = torch.Tensor - T_BHValue_float = torch.Tensor - T_BValue_bool = torch.Tensor - T_BValue_int = torch.Tensor - - T_BPosition = torch.Tensor - T_BQuaternion = torch.Tensor - T_BRotation = torch.Tensor - T_Position = torch.Tensor - T_Quaternion = torch.Tensor - T_Rotation = torch.Tensor - - T_BHDOF_float = torch.Tensor - T_HDOF_float = torch.Tensor +T_BPosition = torch.Tensor #: Tensor of shape [batch, 3]. +T_BQuaternion = torch.Tensor #: Tensor of shape [batch, 4]. +T_BRotation = torch.Tensor #: Tensor of shape [batch, 3,3]. 
diff --git a/src/curobo/util/sample_lib.py b/src/curobo/util/sample_lib.py index 8db2bb2..44a0ea4 100644 --- a/src/curobo/util/sample_lib.py +++ b/src/curobo/util/sample_lib.py @@ -25,6 +25,7 @@ from torch.distributions.multivariate_normal import MultivariateNormal # CuRobo from curobo.types.base import TensorDeviceType from curobo.util.logger import log_error, log_warn +from curobo.util.torch_utils import get_torch_jit_decorator # Local Folder from ..opt.particle.particle_opt_utils import get_stomp_cov @@ -511,7 +512,7 @@ class HaltonGenerator: return gaussian_halton_samples -@torch.jit.script +@get_torch_jit_decorator() def gaussian_transform( uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, std_dev: float ): diff --git a/src/curobo/util/tensor_util.py b/src/curobo/util/tensor_util.py index 6cec216..e55e053 100644 --- a/src/curobo/util/tensor_util.py +++ b/src/curobo/util/tensor_util.py @@ -14,6 +14,9 @@ from typing import List # Third Party import torch +# CuRobo +from curobo.util.torch_utils import get_torch_jit_decorator + def check_tensor_shapes(new_tensor: torch.Tensor, mem_tensor: torch.Tensor): if not isinstance(mem_tensor, torch.Tensor): @@ -65,13 +68,19 @@ def clone_if_not_none(x): return None -@torch.jit.script +@get_torch_jit_decorator() def cat_sum(tensor_list: List[torch.Tensor]): cat_tensor = torch.sum(torch.stack(tensor_list, dim=0), dim=0) return cat_tensor -@torch.jit.script +@get_torch_jit_decorator() +def cat_sum_horizon(tensor_list: List[torch.Tensor]): + cat_tensor = torch.sum(torch.stack(tensor_list, dim=0), dim=(0, -1)) + return cat_tensor + + +@get_torch_jit_decorator() def cat_max(tensor_list: List[torch.Tensor]): cat_tensor = torch.max(torch.stack(tensor_list, dim=0), dim=0)[0] return cat_tensor @@ -85,7 +94,7 @@ def tensor_repeat_seeds(tensor, num_seeds): ) -@torch.jit.script +@get_torch_jit_decorator() def fd_tensor(p: torch.Tensor, dt: torch.Tensor): out = ((torch.roll(p, -1, -2) - p) * (1 / dt).unsqueeze(-1))[..., :-1, :] return out diff --git a/src/curobo/util/torch_utils.py b/src/curobo/util/torch_utils.py index ae221d5..1f1fc1a 100644 --- a/src/curobo/util/torch_utils.py +++ b/src/curobo/util/torch_utils.py @@ -8,12 +8,15 @@ # without an express license agreement from NVIDIA CORPORATION or # its affiliates is strictly prohibited. 
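`fd_tensor` above is a forward difference along the horizon axis: roll by one step, subtract, scale by `1/dt`, and drop the wrapped-around last row. A quick numeric sketch, assuming a per-step `dt` tensor of shape `[batch, horizon]`:

    import torch

    def fd_tensor(p, dt):
        return ((torch.roll(p, -1, -2) - p) * (1 / dt).unsqueeze(-1))[..., :-1, :]

    p = torch.tensor([[[0.0], [1.0], [3.0], [6.0]]])  # [batch=1, horizon=4, dof=1]
    dt = torch.tensor([[0.5, 0.5, 0.5, 0.5]])
    print(fd_tensor(p, dt).squeeze(-1))               # tensor([[2., 4., 6.]])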
# +# Standard Library +import os + # Third Party import torch from packaging import version # CuRobo -from curobo.util.logger import log_warn +from curobo.util.logger import log_info, log_warn def find_first_idx(array, value, EQUAL=False): @@ -31,13 +34,119 @@ def find_last_idx(array, value): def is_cuda_graph_available(): if version.parse(torch.__version__) < version.parse("1.10"): - log_warn("WARNING: Disabling CUDA Graph as pytorch < 1.10") + log_warn("Disabling CUDA Graph as pytorch < 1.10") return False return True def is_torch_compile_available(): + force_compile = os.environ.get("CUROBO_TORCH_COMPILE_FORCE") + if force_compile is not None and bool(int(force_compile)): + return True if version.parse(torch.__version__) < version.parse("2.0"): - log_warn("WARNING: Disabling compile as pytorch < 2.0") + log_info("Disabling torch.compile as pytorch < 2.0") return False + + env_variable = os.environ.get("CUROBO_TORCH_COMPILE_DISABLE") + + if env_variable is None: + log_info("Environment variable for CUROBO_TORCH_COMPILE is not set, Disabling.") + + return False + + if bool(int(env_variable)): + log_info("Environment variable for CUROBO_TORCH_COMPILE is set to Disable") + return False + + log_info("Environment variable for CUROBO_TORCH_COMPILE is set to Enable") + + try: + torch.compile + except: + log_info("Could not find torch.compile, disabling Torch Compile.") + return False + try: + torch._dynamo + except: + log_info("Could not find torch._dynamo, disabling Torch Compile.") + return False + try: + # Third Party + import triton + except: + log_info("Could not find triton, disabling Torch Compile.") + return False + return True + + +def get_torch_compile_options() -> dict: + options = {} + if is_torch_compile_available(): + # Third Party + from torch._inductor import config + + torch._dynamo.config.suppress_errors = True + use_options = { + "max_autotune": True, + "use_mixed_mm": True, + "conv_1x1_as_mm": True, + "coordinate_descent_tuning": True, + "epilogue_fusion": False, + "coordinate_descent_check_all_directions": True, + "force_fuse_int_mm_with_mul": True, + "triton.cudagraphs": False, + "aggressive_fusion": True, + "split_reductions": False, + "worker_start_method": "spawn", + } + for k in use_options.keys(): + if hasattr(config, k): + options[k] = use_options[k] + else: + log_info("Not found in torch.compile: " + k) + return options + + +def disable_torch_compile_global(): + if is_torch_compile_available(): + torch._dynamo.config.disable = True + return True + return False + + +def set_torch_compile_global_options(): + if is_torch_compile_available(): + # Third Party + from torch._inductor import config + + torch._dynamo.config.suppress_errors = True + if hasattr(config, "conv_1x1_as_mm"): + torch._inductor.config.conv_1x1_as_mm = True + if hasattr(config, "coordinate_descent_tuning"): + torch._inductor.config.coordinate_descent_tuning = True + if hasattr(config, "epilogue_fusion"): + torch._inductor.config.epilogue_fusion = False + if hasattr(config, "coordinate_descent_check_all_directions"): + torch._inductor.config.coordinate_descent_check_all_directions = True + if hasattr(config, "force_fuse_int_mm_with_mul"): + torch._inductor.config.force_fuse_int_mm_with_mul = True + if hasattr(config, "use_mixed_mm"): + torch._inductor.config.use_mixed_mm = True + return True + return False + + +def get_torch_jit_decorator( + force_jit: bool = False, dynamic: bool = True, only_valid_for_compile: bool = False +): + if not force_jit and is_torch_compile_available(): + return 
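The compile path is opt-in: `is_torch_compile_available()` returns True only when `CUROBO_TORCH_COMPILE_DISABLE` is explicitly set to `0` (and torch >= 2.0, `torch._dynamo`, and triton are importable), while `CUROBO_TORCH_COMPILE_FORCE=1` short-circuits those checks. The same switches from Python, set before cuRobo modules are imported since the decorator is chosen at import time:

    import os

    os.environ["CUROBO_TORCH_COMPILE_DISABLE"] = "0"  # opt in to torch.compile
    # os.environ["CUROBO_TORCH_COMPILE_FORCE"] = "1"  # bypass the availability checks entirely

    from curobo.util.torch_utils import is_torch_compile_available
    print(is_torch_compile_available())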
torch.compile(options=get_torch_compile_options(), dynamic=dynamic) + elif not only_valid_for_compile: + return torch.jit.script + else: + return empty_decorator + + +def empty_decorator(function): + return function diff --git a/src/curobo/util/trajectory.py b/src/curobo/util/trajectory.py index 619913c..effc1d0 100644 --- a/src/curobo/util/trajectory.py +++ b/src/curobo/util/trajectory.py @@ -25,6 +25,7 @@ from curobo.types.base import TensorDeviceType from curobo.types.robot import JointState from curobo.util.logger import log_error, log_info, log_warn from curobo.util.sample_lib import bspline +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.warp_interpolation import get_cuda_linear_interpolation @@ -114,7 +115,7 @@ def get_spline_interpolated_trajectory(raw_traj, des_horizon, degree=5): for i in range(cpu_traj.shape[-1]): retimed_traj[:, i] = bspline(cpu_traj[:, i], n=des_horizon, degree=degree) - retimed_traj = retimed_traj.to(**vars(tensor_args)) + retimed_traj = retimed_traj.to(**(tensor_args.as_torch_dict())) return retimed_traj @@ -385,7 +386,7 @@ def get_interpolated_trajectory( kind=kind, last_step=des_horizon, ) - retimed_traj = retimed_traj.to(**vars(tensor_args)) + retimed_traj = retimed_traj.to(**(tensor_args.as_torch_dict())) out_traj_state.position[b, :interpolation_steps, :] = retimed_traj out_traj_state.position[b, interpolation_steps:, :] = retimed_traj[ interpolation_steps - 1 : interpolation_steps, : @@ -438,7 +439,39 @@ def linear_smooth( return y_new -@torch.jit.script +@get_torch_jit_decorator() +def calculate_dt_fixed( + vel: torch.Tensor, + acc: torch.Tensor, + jerk: torch.Tensor, + max_vel: torch.Tensor, + max_acc: torch.Tensor, + max_jerk: torch.Tensor, + raw_dt: torch.Tensor, + interpolation_dt: float, +): + # compute scaled dt: + max_v_arr = torch.max(torch.abs(vel), dim=-2)[0] # output is batch, dof + + max_acc_arr = torch.max(torch.abs(acc), dim=-2)[0] + max_jerk_arr = torch.max(torch.abs(jerk), dim=-2)[0] + + vel_scale_dt = (max_v_arr) / (max_vel.view(1, max_v_arr.shape[-1])) # batch,dof + acc_scale_dt = max_acc_arr / (max_acc.view(1, max_acc_arr.shape[-1])) + jerk_scale_dt = max_jerk_arr / (max_jerk.view(1, max_jerk_arr.shape[-1])) + + dt_score_vel = raw_dt * torch.max(vel_scale_dt, dim=-1)[0] # batch, 1 + dt_score_acc = raw_dt * torch.sqrt((torch.max(acc_scale_dt, dim=-1)[0])) + dt_score_jerk = raw_dt * torch.pow((torch.max(jerk_scale_dt, dim=-1)[0]), 1 / 3) + dt_score = torch.maximum(dt_score_vel, dt_score_acc) + dt_score = torch.maximum(dt_score, dt_score_jerk) + dt_score = torch.clamp(dt_score, interpolation_dt, raw_dt) + # NOTE: this dt score is not dt, rather a scaling to convert velocity, acc, jerk that was + # computed with raw_dt to a new dt + return dt_score + + +@get_torch_jit_decorator(force_jit=True) def calculate_dt( vel: torch.Tensor, acc: torch.Tensor, @@ -470,7 +503,7 @@ def calculate_dt( return dt_score -@torch.jit.script +@get_torch_jit_decorator(force_jit=True) def calculate_dt_no_clamp( vel: torch.Tensor, acc: torch.Tensor, @@ -497,7 +530,7 @@ def calculate_dt_no_clamp( return dt_score -@torch.jit.script +@get_torch_jit_decorator() def calculate_tsteps( vel: torch.Tensor, acc: torch.Tensor, @@ -506,13 +539,15 @@ def calculate_tsteps( max_vel: torch.Tensor, max_acc: torch.Tensor, max_jerk: torch.Tensor, - raw_dt: float, + raw_dt: torch.Tensor, min_dt: float, horizon: int, optimize_dt: bool = True, ): # compute scaled dt: - opt_dt = calculate_dt(vel, acc, jerk, max_vel, max_acc, max_jerk, raw_dt, 
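`get_torch_jit_decorator` is the single switch used across this patch: `torch.compile(...)` when available, `torch.jit.script` as the fallback, and a pass-through decorator when `only_valid_for_compile=True` but compile is off. Typical usage, mirroring the call sites above:

    import torch
    from curobo.util.torch_utils import get_torch_jit_decorator

    @get_torch_jit_decorator()                  # torch.compile if enabled, else torch.jit.script
    def weighted_sum(x, w):
        return torch.sum(w * x, dim=-1)

    @get_torch_jit_decorator(force_jit=True)    # always torch.jit.script (e.g. filter_signal_jit)
    def doubled(x):
        return 2.0 * x

    @get_torch_jit_decorator(only_valid_for_compile=True)  # plain Python if compile is unavailable
    def best_idx(x):
        return torch.argmin(x, dim=-1)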
interpolation_dt) + opt_dt = calculate_dt_fixed( + vel, acc, jerk, max_vel, max_acc, max_jerk, raw_dt, interpolation_dt + ) if not optimize_dt: opt_dt[:] = raw_dt traj_steps = (torch.ceil((horizon - 1) * ((opt_dt) / interpolation_dt))).to(dtype=torch.int32) diff --git a/src/curobo/util_file.py b/src/curobo/util_file.py index b608ed8..42a895d 100644 --- a/src/curobo/util_file.py +++ b/src/curobo/util_file.py @@ -11,6 +11,7 @@ # Standard Library import os import shutil +import sys from typing import Dict, List # Third Party diff --git a/src/curobo/wrap/model/robot_segmenter.py b/src/curobo/wrap/model/robot_segmenter.py index 8efa7fb..173ff03 100644 --- a/src/curobo/wrap/model/robot_segmenter.py +++ b/src/curobo/wrap/model/robot_segmenter.py @@ -19,13 +19,16 @@ from torch.profiler import record_function # CuRobo from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel from curobo.geom.cv import get_projection_rays, project_depth_using_rays -from curobo.geom.types import PointCloud from curobo.types.base import TensorDeviceType from curobo.types.camera import CameraObservation -from curobo.types.math import Pose from curobo.types.robot import RobotConfig from curobo.types.state import JointState from curobo.util.logger import log_error +from curobo.util.torch_utils import ( + get_torch_compile_options, + get_torch_jit_decorator, + is_torch_compile_available, +) from curobo.util_file import get_robot_configs_path, join_path, load_yaml from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig @@ -36,6 +39,7 @@ class RobotSegmenter: robot_world: RobotWorld, distance_threshold: float = 0.05, use_cuda_graph: bool = True, + ops_dtype: torch.dtype = torch.float32, ): self._robot_world = robot_world self._projection_rays = None @@ -48,11 +52,12 @@ class RobotSegmenter: self._use_cuda_graph = use_cuda_graph self.tensor_args = robot_world.tensor_args self.distance_threshold = distance_threshold + self._ops_dtype = ops_dtype @staticmethod def from_robot_file( robot_file: Union[str, Dict], - collision_sphere_buffer: Optional[float], + collision_sphere_buffer: Optional[float] = None, distance_threshold: float = 0.05, use_cuda_graph: bool = True, tensor_args: TensorDeviceType = TensorDeviceType(), @@ -78,7 +83,7 @@ class RobotSegmenter: def get_pointcloud_from_depth(self, camera_obs: CameraObservation): if self._projection_rays is None: self.update_camera_projection(camera_obs) - depth_image = camera_obs.depth_image + depth_image = camera_obs.depth_image.to(dtype=self._ops_dtype) if len(depth_image.shape) == 2: depth_image = depth_image.unsqueeze(0) points = project_depth_using_rays(depth_image, self._projection_rays) @@ -91,7 +96,7 @@ class RobotSegmenter: intrinsics = intrinsics.unsqueeze(0) project_rays = get_projection_rays( camera_obs.depth_image.shape[-2], camera_obs.depth_image.shape[-1], intrinsics - ) + ).to(dtype=self._ops_dtype) if self._projection_rays is None: self._projection_rays = project_rays @@ -157,8 +162,12 @@ class RobotSegmenter: def _mask_op(self, camera_obs, q): if len(q.shape) == 1: q = q.unsqueeze(0) + + robot_spheres = self._robot_world.get_kinematics(q).link_spheres_tensor + points = self.get_pointcloud_from_depth(camera_obs) camera_to_robot = camera_obs.pose + points = points.to(dtype=torch.float32) if self._out_points_buffer is None: self._out_points_buffer = points.clone() @@ -181,9 +190,9 @@ class RobotSegmenter: out_points = points_in_robot_frame - dist = self._robot_world.get_point_robot_distance(out_points, q) - - mask, filtered_image = 
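The exponents in `calculate_dt_fixed` follow from how derivatives scale under retiming: replaying a trajectory computed with step `raw_dt` at step `s * raw_dt` divides velocity by `s`, acceleration by `s^2`, and jerk by `s^3`, so the smallest admissible scaling per limit is the ratio, its square root, and its cube root respectively:

    s = \max\Big( \max_i \frac{|v_i|}{v_{\max,i}},\;
                  \sqrt{\max_i \frac{|a_i|}{a_{\max,i}}},\;
                  \Big(\max_i \frac{|j_i|}{j_{\max,i}}\Big)^{1/3} \Big),
    \qquad
    \mathrm{dt\_score} = \mathrm{clamp}\big(s \cdot \mathrm{raw\_dt},\; \mathrm{interpolation\_dt},\; \mathrm{raw\_dt}\big)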
mask_image(camera_obs.depth_image, dist, self.distance_threshold) + mask, filtered_image = mask_spheres_image( + camera_obs.depth_image, robot_spheres, out_points, self.distance_threshold + ) return mask, filtered_image @@ -200,7 +209,7 @@ class RobotSegmenter: return self.kinematics.base_link -@torch.jit.script +@get_torch_jit_decorator() def mask_image( image: torch.Tensor, distance: torch.Tensor, distance_threshold: float ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -212,3 +221,36 @@ def mask_image( mask = torch.logical_and((image > 0.0), (distance > -distance_threshold)) filtered_image = torch.where(mask, 0, image) return mask, filtered_image + + +@get_torch_jit_decorator() +def mask_spheres_image( + image: torch.Tensor, + link_spheres_tensor: torch.Tensor, + points: torch.Tensor, + distance_threshold: float, +) -> Tuple[torch.Tensor, torch.Tensor]: + + if link_spheres_tensor.shape[0] != 1: + assert link_spheres_tensor.shape[0] == points.shape[0] + if len(points.shape) == 2: + points = points.unsqueeze(0) + + robot_spheres = link_spheres_tensor.view(link_spheres_tensor.shape[0], -1, 4).contiguous() + robot_spheres = robot_spheres.unsqueeze(-3) + + robot_radius = robot_spheres[..., 3] + points = points.unsqueeze(-2) + sph_distance = -1 * ( + torch.linalg.norm(points - robot_spheres[..., :3], dim=-1) - robot_radius + ) # b, n_spheres + distance = torch.max(sph_distance, dim=-1)[0] + + distance = distance.view( + image.shape[0], + image.shape[1], + image.shape[2], + ) + mask = torch.logical_and((image > 0.0), (distance > -distance_threshold)) + filtered_image = torch.where(mask, 0, image) + return mask, filtered_image diff --git a/src/curobo/wrap/model/robot_world.py b/src/curobo/wrap/model/robot_world.py index 2deba06..5900712 100644 --- a/src/curobo/wrap/model/robot_world.py +++ b/src/curobo/wrap/model/robot_world.py @@ -36,6 +36,11 @@ from curobo.types.robot import RobotConfig from curobo.types.state import JointState from curobo.util.logger import log_error from curobo.util.sample_lib import HaltonGenerator +from curobo.util.torch_utils import ( + get_torch_compile_options, + get_torch_jit_decorator, + is_torch_compile_available, +) from curobo.util.warp import init_warp from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml @@ -192,6 +197,9 @@ class RobotWorld(RobotWorldConfig): def update_world(self, world_config: WorldConfig): self.world_model.load_collision_model(world_config) + def clear_world_cache(self): + self.world_model.clear_cache() + def get_collision_distance( self, x_sph: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None ) -> torch.Tensor: @@ -364,16 +372,7 @@ class RobotWorld(RobotWorldConfig): if len(q.shape) == 1: log_error("q should be of shape [b, dof]") kin_state = self.get_kinematics(q) - b, n = None, None - if len(points.shape) == 3: - b, n, _ = points.shape - points = points.view(b * n, 3) - pt_distance = point_robot_distance(kin_state.link_spheres_tensor, points) - - if b is not None: - pt_distance = pt_distance.view(b, n) - return pt_distance def get_active_js(self, full_js: JointState): @@ -382,27 +381,50 @@ class RobotWorld(RobotWorldConfig): return out_js -@torch.jit.script +@get_torch_jit_decorator() def sum_mask(d1, d2, d3): d_total = d1 + d2 + d3 d_mask = d_total == 0.0 return d_mask.view(-1) -@torch.jit.script +@get_torch_jit_decorator() def mask(d1, d2, d3): d_total = d1 + d2 + d3 d_mask = d_total == 0.0 return d_mask -@torch.jit.script +@get_torch_jit_decorator() def 
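`mask_spheres_image` and the reworked `point_robot_distance` below share one primitive: the signed distance of a point to a union of spheres is `max_s (r_s - ||p - c_s||)`, positive inside any sphere, and a pixel is masked as robot when that distance clears `-distance_threshold`. A toy sketch of the rule (points, spheres, depths, and the 5 cm threshold are made up):

    import torch

    def point_sphere_distance(points, spheres):
        # points: [n, 3], spheres: [m, 4] as (x, y, z, radius)
        d = torch.linalg.norm(points[:, None, :] - spheres[None, :, :3], dim=-1)
        return torch.max(spheres[None, :, 3] - d, dim=-1)[0]  # > 0 means inside a sphere

    points = torch.tensor([[0.05, 0.0, 0.0], [1.0, 0.0, 0.0]])
    robot_spheres = torch.tensor([[0.0, 0.0, 0.0, 0.1]])
    depth = torch.tensor([0.8, 1.2])  # per-pixel depth for the two back-projected points

    distance = point_sphere_distance(points, robot_spheres)
    mask = torch.logical_and(depth > 0.0, distance > -0.05)
    filtered = torch.where(mask, 0.0, depth)
    print(mask, filtered)  # tensor([ True, False]) tensor([0.0000, 1.2000])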
point_robot_distance(link_spheres_tensor, points): - robot_spheres = link_spheres_tensor.view(1, -1, 4).contiguous() - robot_radius = robot_spheres[:, :, 3] - points = points.unsqueeze(1) - sph_distance = ( - torch.linalg.norm(points - robot_spheres[:, :, :3], dim=-1) - robot_radius + """Compute distance between robot and points + + Args: + link_spheres_tensor: [batch_robot, n_robot_spheres, 4] + points: [batch_points, n_points, 3] + + Returns: + distance: [batch_points, n_points] + """ + if link_spheres_tensor.shape[0] != 1: + assert link_spheres_tensor.shape[0] == points.shape[0] + squeeze_shape = False + n = 1 + if len(points.shape) == 2: + squeeze_shape = True + n, _ = points.shape + points = points.unsqueeze(0) + + robot_spheres = link_spheres_tensor.view(link_spheres_tensor.shape[0], -1, 4).contiguous() + robot_spheres = robot_spheres.unsqueeze(-3) + + robot_radius = robot_spheres[..., 3] + points = points.unsqueeze(-2) + sph_distance = -1 * ( + torch.linalg.norm(points - robot_spheres[..., :3], dim=-1) - robot_radius ) # b, n_spheres - pt_distance = torch.max(-1 * sph_distance, dim=-1)[0] + pt_distance = torch.max(sph_distance, dim=-1)[0] + + if squeeze_shape: + pt_distance = pt_distance.view(n) return pt_distance diff --git a/src/curobo/wrap/reacher/evaluator.py b/src/curobo/wrap/reacher/evaluator.py index 2622efa..66926fb 100644 --- a/src/curobo/wrap/reacher/evaluator.py +++ b/src/curobo/wrap/reacher/evaluator.py @@ -20,6 +20,7 @@ import torch.autograd.profiler as profiler # CuRobo from curobo.types.robot import JointState from curobo.types.tensor import T_DOF +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util.trajectory import calculate_dt @@ -32,7 +33,7 @@ class TrajEvaluatorConfig: max_dt: float = 0.1 -@torch.jit.script +@get_torch_jit_decorator() def compute_path_length(vel, traj_dt, cspace_distance_weight): pl = torch.sum( torch.sum(torch.abs(vel) * traj_dt.unsqueeze(-1) * cspace_distance_weight, dim=-1), dim=-1 @@ -40,24 +41,25 @@ def compute_path_length(vel, traj_dt, cspace_distance_weight): return pl -@torch.jit.script +@get_torch_jit_decorator() def compute_path_length_cost(vel, cspace_distance_weight): pl = torch.sum(torch.sum(torch.abs(vel) * cspace_distance_weight, dim=-1), dim=-1) return pl -@torch.jit.script +@get_torch_jit_decorator() def smooth_cost(abs_acc, abs_jerk, opt_dt): # acc = torch.max(torch.max(abs_acc, dim=-1)[0], dim=-1)[0] # jerk = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0] jerk = torch.mean(torch.max(abs_jerk, dim=-1)[0], dim=-1) mean_acc = torch.mean(torch.max(abs_acc, dim=-1)[0], dim=-1) # [0] - a = (jerk * 0.001) + 5.0 * opt_dt + (mean_acc * 0.01) + a = (jerk * 0.001) + 10.0 * opt_dt + (mean_acc * 0.01) + # a = (jerk * 0.001) + 50.0 * opt_dt + (mean_acc * 0.01) return a -@torch.jit.script +@get_torch_jit_decorator() def compute_smoothness( vel: torch.Tensor, acc: torch.Tensor, @@ -104,7 +106,7 @@ def compute_smoothness( return (acc_label, smooth_cost(abs_acc, abs_jerk, dt_score)) -@torch.jit.script +@get_torch_jit_decorator() def compute_smoothness_opt_dt( vel, acc, jerk, max_vel: torch.Tensor, max_acc: float, max_jerk: float, opt_dt: torch.Tensor ): diff --git a/src/curobo/wrap/reacher/ik_solver.py b/src/curobo/wrap/reacher/ik_solver.py index 9192dc1..0ad5635 100644 --- a/src/curobo/wrap/reacher/ik_solver.py +++ b/src/curobo/wrap/reacher/ik_solver.py @@ -34,6 +34,7 @@ from curobo.types.robot import JointState, RobotConfig from curobo.types.tensor import T_BDOF, T_BValue_bool, T_BValue_float from 
curobo.util.logger import log_error, log_warn from curobo.util.sample_lib import HaltonGenerator +from curobo.util.torch_utils import get_torch_jit_decorator from curobo.util_file import ( get_robot_configs_path, get_task_configs_path, @@ -1010,7 +1011,7 @@ class IKSolver(IKSolverConfig): ] -@torch.jit.script +@get_torch_jit_decorator() def get_success( feasible, position_error, @@ -1028,7 +1029,7 @@ def get_success( return success -@torch.jit.script +@get_torch_jit_decorator() def get_result( pose_error, position_error, diff --git a/src/curobo/wrap/reacher/motion_gen.py b/src/curobo/wrap/reacher/motion_gen.py index b152e09..35c6906 100644 --- a/src/curobo/wrap/reacher/motion_gen.py +++ b/src/curobo/wrap/reacher/motion_gen.py @@ -197,7 +197,7 @@ class MotionGenConfig: smooth_weight: List[float] = None, finetune_smooth_weight: Optional[List[float]] = None, state_finite_difference_mode: Optional[str] = None, - finetune_dt_scale: float = 0.98, + finetune_dt_scale: float = 0.95, maximum_trajectory_time: Optional[float] = None, maximum_trajectory_dt: float = 0.1, velocity_scale: Optional[Union[List[float], float]] = None, @@ -2086,6 +2086,7 @@ class MotionGen(MotionGenConfig): # self.trajopt_solver.compute_metrics(not og_evaluate, og_evaluate) if self.store_debug_in_result: result.debug_info["trajopt_result"] = traj_result + # run finetune if plan_config.enable_finetune_trajopt and torch.count_nonzero(traj_result.success) > 0: with profiler.record_function("motion_gen/finetune_trajopt"): @@ -2113,6 +2114,9 @@ class MotionGen(MotionGenConfig): traj_result.solve_time = og_solve_time if self.store_debug_in_result: result.debug_info["finetune_trajopt_result"] = traj_result + elif plan_config.enable_finetune_trajopt: + traj_result.success = traj_result.success[:, 0] + result.success = traj_result.success result.interpolated_plan = traj_result.interpolated_solution @@ -2190,7 +2194,7 @@ class MotionGen(MotionGenConfig): # warm up js_trajopt: goal_state = start_state.clone() goal_state.position[..., warmup_joint_index] += warmup_joint_delta - for _ in range(2): + for _ in range(3): self.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1)) if enable_graph: start_state = JointState.from_position( @@ -2214,7 +2218,7 @@ class MotionGen(MotionGenConfig): if n_goalset == -1: retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) start_state.position[..., warmup_joint_index] += warmup_joint_delta - for _ in range(2): + for _ in range(3): self.plan_single( start_state, retract_pose, @@ -2243,7 +2247,7 @@ class MotionGen(MotionGenConfig): quaternion=state.ee_quat_seq.repeat(n_goalset, 1).view(1, n_goalset, 4), ) start_state.position[..., warmup_joint_index] += warmup_joint_delta - for _ in range(2): + for _ in range(3): self.plan_goalset( start_state, retract_pose, @@ -2278,7 +2282,7 @@ class MotionGen(MotionGenConfig): retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) start_state.position[..., warmup_joint_index] += warmup_joint_delta - for _ in range(2): + for _ in range(3): if batch_env_mode: self.plan_batch_env( start_state, @@ -2307,7 +2311,7 @@ class MotionGen(MotionGenConfig): .contiguous(), ) start_state.position[..., warmup_joint_index] += warmup_joint_delta - for _ in range(2): + for _ in range(3): if batch_env_mode: self.plan_batch_env_goalset( start_state, diff --git a/src/curobo/wrap/reacher/trajopt.py b/src/curobo/wrap/reacher/trajopt.py index 5e3bc78..67f1019 100644 --- a/src/curobo/wrap/reacher/trajopt.py +++ 
b/src/curobo/wrap/reacher/trajopt.py @@ -41,6 +41,7 @@ from curobo.types.robot import JointState, RobotConfig from curobo.types.tensor import T_BDOF, T_DOF, T_BValue_bool, T_BValue_float from curobo.util.helpers import list_idx_if_not_none from curobo.util.logger import log_error, log_info, log_warn +from curobo.util.torch_utils import get_torch_jit_decorator, is_torch_compile_available from curobo.util.trajectory import ( InterpolateType, calculate_dt_no_clamp, @@ -877,24 +878,37 @@ class TrajOptSolver(TrajOptSolverConfig): result.metrics.goalset_index = metrics.goalset_index st_time = time.time() - feasible = torch.all(result.metrics.feasible, dim=-1) + if result.metrics.cspace_error is None and result.metrics.position_error is None: + raise log_error("convergence check requires either goal_pose or goal_state") - if result.metrics.position_error is not None: - converge = torch.logical_and( - result.metrics.position_error[..., -1] <= self.position_threshold, - result.metrics.rotation_error[..., -1] <= self.rotation_threshold, - ) - elif result.metrics.cspace_error is not None: - converge = result.metrics.cspace_error[..., -1] <= self.cspace_threshold - else: - raise ValueError("convergence check requires either goal_pose or goal_state") + success = jit_feasible_success( + result.metrics.feasible, + result.metrics.position_error, + result.metrics.rotation_error, + result.metrics.cspace_error, + self.position_threshold, + self.rotation_threshold, + self.cspace_threshold, + ) + if False: + feasible = torch.all(result.metrics.feasible, dim=-1) - success = torch.logical_and(feasible, converge) + if result.metrics.position_error is not None: + converge = torch.logical_and( + result.metrics.position_error[..., -1] <= self.position_threshold, + result.metrics.rotation_error[..., -1] <= self.rotation_threshold, + ) + elif result.metrics.cspace_error is not None: + converge = result.metrics.cspace_error[..., -1] <= self.cspace_threshold + else: + raise ValueError("convergence check requires either goal_pose or goal_state") + + success = torch.logical_and(feasible, converge) if return_all_solutions: traj_result = TrajResult( success=success, goal=goal, - solution=result.action.scale(self.solver_dt / opt_dt.view(-1, 1, 1)), + solution=result.action.scale_by_dt(self.solver_dt_tensor, opt_dt.view(-1, 1, 1)), seed=seed_traj, position_error=result.metrics.position_error, rotation_error=result.metrics.rotation_error, @@ -928,49 +942,86 @@ class TrajOptSolver(TrajOptSolverConfig): ) with profiler.record_function("trajopt/best_select"): - success[~smooth_label] = False - # get the best solution: - if result.metrics.pose_error is not None: - convergence_error = result.metrics.pose_error[..., -1] - elif result.metrics.cspace_error is not None: - convergence_error = result.metrics.cspace_error[..., -1] + if True: # not get_torch_jit_decorator() == torch.jit.script: + # This only works if torch compile is available: + ( + idx, + position_error, + rotation_error, + cspace_error, + goalset_index, + opt_dt, + success, + ) = jit_trajopt_best_select( + success, + smooth_label, + result.metrics.cspace_error, + result.metrics.pose_error, + result.metrics.position_error, + result.metrics.rotation_error, + result.metrics.goalset_index, + result.metrics.cost, + smooth_cost, + batch_mode, + goal.batch, + num_seeds, + self._col, + opt_dt, + ) + if batch_mode: + last_tstep = [last_tstep[i] for i in idx] + else: + last_tstep = [last_tstep[idx.item()]] + best_act_seq = result.action[idx] + best_raw_action = 
result.raw_action[idx] + interpolated_traj = interpolated_trajs[idx] + else: - raise ValueError("convergence check requires either goal_pose or goal_state") - running_cost = torch.mean(result.metrics.cost, dim=-1) * 0.0001 - error = convergence_error + smooth_cost + running_cost - error[~success] += 10000.0 - if batch_mode: - idx = torch.argmin(error.view(goal.batch, num_seeds), dim=-1) - idx = idx + num_seeds * self._col - last_tstep = [last_tstep[i] for i in idx] - success = success[idx] - else: - idx = torch.argmin(error, dim=0) + success[~smooth_label] = False + # get the best solution: + if result.metrics.pose_error is not None: + convergence_error = result.metrics.pose_error[..., -1] + elif result.metrics.cspace_error is not None: + convergence_error = result.metrics.cspace_error[..., -1] + else: + raise ValueError( + "convergence check requires either goal_pose or goal_state" + ) + running_cost = torch.mean(result.metrics.cost, dim=-1) * 0.0001 + error = convergence_error + smooth_cost + running_cost + error[~success] += 10000.0 + if batch_mode: + idx = torch.argmin(error.view(goal.batch, num_seeds), dim=-1) + idx = idx + num_seeds * self._col + last_tstep = [last_tstep[i] for i in idx] + success = success[idx] + else: + idx = torch.argmin(error, dim=0) - last_tstep = [last_tstep[idx.item()]] - success = success[idx : idx + 1] + last_tstep = [last_tstep[idx.item()]] + success = success[idx : idx + 1] - best_act_seq = result.action[idx] - best_raw_action = result.raw_action[idx] - interpolated_traj = interpolated_trajs[idx] - goalset_index = position_error = rotation_error = cspace_error = None - if result.metrics.position_error is not None: - position_error = result.metrics.position_error[idx, -1] - if result.metrics.rotation_error is not None: - rotation_error = result.metrics.rotation_error[idx, -1] - if result.metrics.cspace_error is not None: - cspace_error = result.metrics.cspace_error[idx, -1] - if result.metrics.goalset_index is not None: - goalset_index = result.metrics.goalset_index[idx, -1] + best_act_seq = result.action[idx] + best_raw_action = result.raw_action[idx] + interpolated_traj = interpolated_trajs[idx] + goalset_index = position_error = rotation_error = cspace_error = None + if result.metrics.position_error is not None: + position_error = result.metrics.position_error[idx, -1] + if result.metrics.rotation_error is not None: + rotation_error = result.metrics.rotation_error[idx, -1] + if result.metrics.cspace_error is not None: + cspace_error = result.metrics.cspace_error[idx, -1] + if result.metrics.goalset_index is not None: + goalset_index = result.metrics.goalset_index[idx, -1] - opt_dt = opt_dt[idx] + opt_dt = opt_dt[idx] if self.sync_cuda_time: torch.cuda.synchronize() if len(best_act_seq.shape) == 3: opt_dt_v = opt_dt.view(-1, 1, 1) else: opt_dt_v = opt_dt.view(1, 1) - opt_solution = best_act_seq.scale(self.solver_dt / opt_dt_v) + opt_solution = best_act_seq.scale_by_dt(self.solver_dt_tensor, opt_dt_v) select_time = time.time() - st_time debug_info = None if self.store_debug_in_result: @@ -1174,7 +1225,7 @@ class TrajOptSolver(TrajOptSolverConfig): self._max_joint_vel, self._max_joint_acc, self._max_joint_jerk, - self.solver_dt, + self.solver_dt_tensor, kind=self.interpolation_type, tensor_args=self.tensor_args, out_traj_state=self._interpolated_traj_buffer, @@ -1224,7 +1275,12 @@ class TrajOptSolver(TrajOptSolverConfig): @property def solver_dt(self): - return self.solver.safety_rollout.dynamics_model.dt_traj_params.base_dt + return 
self.solver.safety_rollout.dynamics_model.traj_dt[0] + # return self.solver.safety_rollout.dynamics_model.dt_traj_params.base_dt + + @property + def solver_dt_tensor(self): + return self.solver.safety_rollout.dynamics_model.traj_dt[0] def update_solver_dt( self, @@ -1254,3 +1310,79 @@ class TrajOptSolver(TrajOptSolverConfig): for rollout in rollouts if isinstance(rollout, ArmReacher) ] + + +@get_torch_jit_decorator() +def jit_feasible_success( + feasible, + position_error: Union[torch.Tensor, None], + rotation_error: Union[torch.Tensor, None], + cspace_error: Union[torch.Tensor, None], + position_threshold: float, + rotation_threshold: float, + cspace_threshold: float, +): + feasible = torch.all(feasible, dim=-1) + converge = feasible + if position_error is not None and rotation_error is not None: + converge = torch.logical_and( + position_error[..., -1] <= position_threshold, + rotation_error[..., -1] <= rotation_threshold, + ) + elif cspace_error is not None: + converge = cspace_error[..., -1] <= cspace_threshold + + success = torch.logical_and(feasible, converge) + return success + + +@get_torch_jit_decorator(only_valid_for_compile=True) +def jit_trajopt_best_select( + success, + smooth_label, + cspace_error: Union[torch.Tensor, None], + pose_error: Union[torch.Tensor, None], + position_error: Union[torch.Tensor, None], + rotation_error: Union[torch.Tensor, None], + goalset_index: Union[torch.Tensor, None], + cost, + smooth_cost, + batch_mode: bool, + batch: int, + num_seeds: int, + col, + opt_dt, +): + success[~smooth_label] = False + convergence_error = 0 + # get the best solution: + if pose_error is not None: + convergence_error = pose_error[..., -1] + elif cspace_error is not None: + convergence_error = cspace_error[..., -1] + + running_cost = torch.mean(cost, dim=-1) * 0.0001 + error = convergence_error + smooth_cost + running_cost + error[~success] += 10000.0 + if batch_mode: + idx = torch.argmin(error.view(batch, num_seeds), dim=-1) + idx = idx + num_seeds * col + success = success[idx] + else: + idx = torch.argmin(error, dim=0) + + success = success[idx : idx + 1] + + # goalset_index = position_error = rotation_error = cspace_error = None + if position_error is not None: + position_error = position_error[idx, -1] + if rotation_error is not None: + rotation_error = rotation_error[idx, -1] + if cspace_error is not None: + cspace_error = cspace_error[idx, -1] + if goalset_index is not None: + goalset_index = goalset_index[idx, -1] + + opt_dt = opt_dt[idx] + + return idx, position_error, rotation_error, cspace_error, goalset_index, opt_dt, success diff --git a/tests/cost_test.py b/tests/cost_test.py index 5a38b93..e1df675 100644 --- a/tests/cost_test.py +++ b/tests/cost_test.py @@ -41,7 +41,7 @@ def test_primitive_collision_cost(): ) cost = PrimitiveCollisionCost(cost_cfg) q_spheres = torch.as_tensor( - [[0.1, 0.0, 0.0, 0.2], [10.0, 0.0, 0.0, 0.1]], **vars(tensor_args) + [[0.1, 0.0, 0.0, 0.2], [10.0, 0.0, 0.0, 0.1]], **(tensor_args.as_torch_dict()) ).view(-1, 1, 1, 4) c = cost.forward(q_spheres).flatten() assert c[0] > 0.0 and c[1] == 0.0 diff --git a/tests/geom_test.py b/tests/geom_test.py index 0d08e69..9beb0b2 100644 --- a/tests/geom_test.py +++ b/tests/geom_test.py @@ -33,7 +33,8 @@ def test_world_primitive(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldPrimitiveCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 
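For reference, the seed selection inside `jit_trajopt_best_select` scores each seed by convergence error plus smoothness plus a small running-cost tie-breaker, pushes failed seeds away with a large constant, and takes a per-problem argmin; `self._col` is assumed here to be `arange(batch)`, so the winning seed index maps back into the flat `[batch * num_seeds]` layout. A toy sketch with 2 problems and 3 seeds:

    import torch

    batch, num_seeds = 2, 3
    error = torch.tensor([0.5, 0.2, 0.4, 0.9, 0.1, 0.3])  # flat [batch * num_seeds]
    success = torch.tensor([True, False, True, True, True, False])

    error = error.clone()
    error[~success] += 10000.0                                     # never pick an infeasible seed
    seed_idx = torch.argmin(error.view(batch, num_seeds), dim=-1)  # tensor([2, 1])
    col = torch.arange(batch)
    flat_idx = seed_idx + num_seeds * col                          # tensor([2, 4])
    print(flat_idx, success[flat_idx])                             # rows to pull from the results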
0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) # create buffers: query_buffer = CollisionQueryBuffer.initialize_from_shape( @@ -58,7 +59,8 @@ def test_batch_world_primitive(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldPrimitiveCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(-1, 1, 1, 4) env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) # create buffers: @@ -90,7 +92,8 @@ def test_swept_world_primitive(): new_cube = Cuboid("cube_1", [0, 0, 1, 1, 0, 0, 0], None, [0.1, 0.2, 0.2]) coll_check.add_obb(new_cube, 0) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) env_query_idx = None env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) @@ -125,7 +128,8 @@ def test_world_primitive_mesh_instance(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldMeshCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) # create buffers: query_buffer = CollisionQueryBuffer.initialize_from_shape( @@ -150,7 +154,8 @@ def test_batch_world_primitive_mesh_instance(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldMeshCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(-1, 1, 1, 4) env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) # create buffers: @@ -184,7 +189,8 @@ def test_swept_world_primitive_mesh_instance(): w_obj_pose = Pose.from_list([0, 0, 1, 1, 0, 0, 0], tensor_args) coll_check.add_obb_from_raw("cube_1", dims, 0, w_obj_pose=w_obj_pose) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) env_query_idx = None env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) diff --git a/tests/interpolation_test.py b/tests/interpolation_test.py index 7f38709..11fe94d 100644 --- a/tests/interpolation_test.py +++ b/tests/interpolation_test.py @@ -19,10 +19,11 @@ from curobo.util.trajectory import InterpolateType, get_batch_interpolated_traje def test_linear_interpolation(): - b, h, dof = 1, 24, 1 - raw_dt = 0.4 - int_dt = 0.01 tensor_args = TensorDeviceType() + + b, h, dof = 1, 24, 1 + raw_dt = tensor_args.to_device(0.4) + int_dt = 0.01 # initialize raw trajectory: in_traj = JointState.zeros((b, h, dof), tensor_args) in_traj.position = torch.zeros((b, h, dof), device=tensor_args.device) @@ -62,6 +63,3 @@ def test_linear_interpolation(): ).item() < 
0.05 ) - - -# test_linear_interpolation() diff --git a/tests/kinematics_test.py b/tests/kinematics_test.py index 14b7386..c193ea8 100644 --- a/tests/kinematics_test.py +++ b/tests/kinematics_test.py @@ -61,11 +61,15 @@ def test_franka_kinematics(cfg): tensor_args = TensorDeviceType() robot_model = CudaRobotModel(cfg) - q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1) - ee_position = torch.as_tensor([6.0860e-02, -4.7547e-12, 7.6373e-01], **vars(tensor_args)).view( - 1, -1 - ) - ee_quat = torch.as_tensor([0.0382, 0.9193, 0.3808, 0.0922], **vars(tensor_args)).view(1, -1) + q_test = torch.as_tensor( + [0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **(tensor_args.as_torch_dict()) + ).view(1, -1) + ee_position = torch.as_tensor( + [6.0860e-02, -4.7547e-12, 7.6373e-01], **(tensor_args.as_torch_dict()) + ).view(1, -1) + ee_quat = torch.as_tensor( + [0.0382, 0.9193, 0.3808, 0.0922], **(tensor_args.as_torch_dict()) + ).view(1, -1) b_list = [1, 10, 100, 5000][:1] for b in b_list: state = robot_model.get_state(q_test.repeat(b, 1).clone()) @@ -80,7 +84,9 @@ def test_franka_attached_object_kinematics(cfg): tensor_args = TensorDeviceType() robot_model = CudaRobotModel(cfg) - q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1) + q_test = torch.as_tensor( + [0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **(tensor_args.as_torch_dict()) + ).view(1, -1) sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("attached_object") state = robot_model.get_state(q_test.clone()) @@ -91,7 +97,7 @@ def test_franka_attached_object_kinematics(cfg): assert torch.norm(attached_spheres[:, :, 1]) < 0.0001 # attach an object: - new_object = torch.zeros((2, 4), **vars(tensor_args)) + new_object = torch.zeros((2, 4), **(tensor_args.as_torch_dict())) new_object[:, 3] = 0.01 new_object[:, 1] = 0.1 robot_model.kinematics_config.update_link_spheres("attached_object", new_object) @@ -106,7 +112,9 @@ def test_franka_attach_object_kinematics(cfg): tensor_args = TensorDeviceType() robot_model = CudaRobotModel(cfg) - q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1) + q_test = torch.as_tensor( + [0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **(tensor_args.as_torch_dict()) + ).view(1, -1) sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("attached_object") state = robot_model.get_state(q_test.clone()) @@ -117,7 +125,7 @@ def test_franka_attach_object_kinematics(cfg): assert torch.norm(attached_spheres[:, :, 1]) < 0.0001 # attach an object: - new_object = torch.zeros((4, 4), **vars(tensor_args)) + new_object = torch.zeros((4, 4), **(tensor_args.as_torch_dict())) new_object[:2, 3] = 0.01 new_object[:2, 1] = 0.1 # print(attached_spheres[:, sph_idx,:].shape) @@ -175,7 +183,9 @@ def test_franka_toggle_link_collision(cfg): tensor_args = TensorDeviceType() robot_model = CudaRobotModel(cfg) - q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1) + q_test = torch.as_tensor( + [0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **(tensor_args.as_torch_dict()) + ).view(1, -1) sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("panda_link5") state = robot_model.get_state(q_test.clone()) diff --git a/tests/motion_gen_module_test.py b/tests/motion_gen_module_test.py index de8e467..7d2a97e 100644 --- a/tests/motion_gen_module_test.py +++ b/tests/motion_gen_module_test.py @@ -327,4 +327,4 @@ def test_motion_gen_single_js(motion_gen_str, enable_graph, 
request): reached_state = result.optimized_plan[-1] - assert torch.norm(goal_state.position - reached_state.position) < 0.005 + assert torch.norm(goal_state.position - reached_state.position) < 0.05 diff --git a/tests/nvblox_test.py b/tests/nvblox_test.py index 92f1551..f6da8c4 100644 --- a/tests/nvblox_test.py +++ b/tests/nvblox_test.py @@ -17,9 +17,11 @@ import pytest import torch # CuRobo -from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollisionConfig -from curobo.geom.types import Cuboid, WorldConfig +from curobo.geom.sdf.world import CollisionCheckerType, CollisionQueryBuffer, WorldCollisionConfig +from curobo.geom.types import BloxMap, Cuboid, WorldConfig from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose from curobo.util_file import get_world_configs_path, join_path, load_yaml try: @@ -41,7 +43,8 @@ def test_world_blox_trajectory(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldBloxCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, -1, 1, 4) # create buffers: query_buffer = CollisionQueryBuffer.initialize_from_shape( @@ -66,7 +69,8 @@ def test_world_blox(): coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) coll_check = WorldBloxCollision(coll_cfg) x_sph = torch.as_tensor( - [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) # create buffers: query_buffer = CollisionQueryBuffer.initialize_from_shape( @@ -116,15 +120,12 @@ def test_nvblox_world_from_primitive_world(): data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) world_cfg = WorldConfig.from_dict(data_dict).get_mesh_world(True) mesh = world_cfg.mesh[0].get_trimesh_mesh() - # world_cfg.mesh[0].save_as_mesh("world.obj") # Third Party - from nvblox_torch.datasets.mesh_dataset import MeshDataset - - # CuRobo - from curobo.geom.sdf.world import CollisionCheckerType - from curobo.geom.types import BloxMap - from curobo.types.camera import CameraObservation - from curobo.types.math import Pose + try: + # Third Party + from nvblox_torch.datasets.mesh_dataset import MeshDataset + except ImportError: + pytest.skip("pyrender and scikit-image is not installed") # create a nvblox collision checker: world_config = WorldConfig( diff --git a/tests/pose_reaching_test.py b/tests/pose_reaching_test.py index ef34588..9fa86f2 100644 --- a/tests/pose_reaching_test.py +++ b/tests/pose_reaching_test.py @@ -21,9 +21,7 @@ from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGen "parallel_finetune, force_graph, expected_motion_time", [ (True, False, 12), - (False, False, 12), (True, True, 12), - (False, True, 12), ], ) def test_pose_sequence_ur5e(parallel_finetune, force_graph, expected_motion_time): @@ -63,7 +61,7 @@ def test_pose_sequence_ur5e(parallel_finetune, force_graph, expected_motion_time start_state.clone(), goal_pose, plan_config=MotionGenPlanConfig( - parallel_finetune=parallel_finetune, max_attempts=1, enable_graph=force_graph + parallel_finetune=parallel_finetune, max_attempts=10, enable_graph=force_graph ), ) if result.success.item(): diff 
--git a/tests/robot_segmentation_test.py b/tests/robot_segmentation_test.py new file mode 100644 index 0000000..97d0a32 --- /dev/null +++ b/tests/robot_segmentation_test.py @@ -0,0 +1,209 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.geom.types import PointCloud, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_segmenter import RobotSegmenter + +torch.manual_seed(30) + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +try: + # Third Party + from nvblox_torch.datasets.mesh_dataset import MeshDataset + +except ImportError: + pytest.skip( + "Nvblox Torch is not available or pyrender is not installed", allow_module_level=True + ) + + +def create_render_dataset( + robot_file, + fov_deg: float = 60, + n_frames: int = 20, + retract_delta: float = 0.0, +): + # load robot: + robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file)) + robot_dict["robot_cfg"]["kinematics"]["load_link_names_with_mesh"] = True + robot_dict["robot_cfg"]["kinematics"]["load_meshes"] = True + + robot_cfg = RobotConfig.from_dict(robot_dict["robot_cfg"]) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + + q = kin_model.retract_config + + q += retract_delta + + meshes = kin_model.get_robot_as_mesh(q) + + world = WorldConfig(mesh=meshes[:]) + world_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_table.cuboid[0].dims = [0.5, 0.5, 0.1] + world.add_obstacle(world_table.objects[0]) + robot_mesh = ( + WorldConfig.create_merged_mesh_world(world, process_color=False).mesh[0].get_trimesh_mesh() + ) + + mesh_dataset = MeshDataset( + None, + n_frames=n_frames, + image_size=480, + save_data_dir=None, + trimesh_mesh=robot_mesh, + fov_deg=fov_deg, + ) + q_js = JointState(position=q, joint_names=kin_model.joint_names) + + return mesh_dataset, q_js + + +@pytest.mark.parametrize( + "robot_file", + ["iiwa.yml", "iiwa_allegro.yml", "franka.yml", "ur10e.yml", "ur5e.yml"], +) +def test_mask_image(robot_file): + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, n_frames=5) + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + + 
depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + if torch.count_nonzero(depth_mask) > 100: + return + assert False + + +@pytest.mark.parametrize( + "robot_file", + ["iiwa.yml", "iiwa_allegro.yml", "franka.yml", "ur10e.yml", "ur5e.yml"], +) +def test_batch_mask_image(robot_file): + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, n_frames=5) + mesh_dataset_zoom, q_js = create_render_dataset(robot_file, fov_deg=40, n_frames=5) + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + data_zoom = mesh_dataset_zoom[i] + + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + if torch.count_nonzero(depth_mask[0]) > 100 and (torch.count_nonzero(depth_mask[1]) > 100): + return + assert False + + +@pytest.mark.parametrize( + "robot_file", + ["iiwa.yml", "iiwa_allegro.yml", "franka.yml", "ur10e.yml", "ur5e.yml"], +) +def test_batch_robot_mask_image(robot_file): + # create robot segmenter: + tensor_args = TensorDeviceType() + + curobo_segmenter = RobotSegmenter.from_robot_file( + robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True + ) + + mesh_dataset, q_js = create_render_dataset(robot_file, n_frames=5) + mesh_dataset_zoom, q_js_zoom = create_render_dataset( + robot_file, + fov_deg=40, + n_frames=5, + retract_delta=0.4, + ) + q_js = q_js.unsqueeze(0).stack(q_js_zoom.unsqueeze(0)) + + for i in range(len(mesh_dataset)): + + data = mesh_dataset[i] + data_zoom = mesh_dataset_zoom[i] + + cam_obs = CameraObservation( + depth_image=tensor_args.to_device(data["depth"]) * 1000, + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + cam_obs_zoom = CameraObservation( + depth_image=tensor_args.to_device(data_zoom["depth"]) * 1000, + intrinsics=data_zoom["intrinsics"], + pose=Pose.from_matrix(data_zoom["pose"].to(device=tensor_args.device)), + ) + cam_obs = cam_obs.stack(cam_obs_zoom) + if not curobo_segmenter.ready: + curobo_segmenter.update_camera_projection(cam_obs) + + depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js( + cam_obs, + q_js, + ) + if torch.count_nonzero(depth_mask[0]) > 100 and (torch.count_nonzero(depth_mask[1]) > 100): + return + assert False diff --git a/tests/voxel_collision_test.py b/tests/voxel_collision_test.py new file mode 100644 index 0000000..ddf3a3d --- /dev/null +++ b/tests/voxel_collision_test.py @@ -0,0 +1,254 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionCheckerType, + CollisionQueryBuffer, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.sdf.world_voxel import WorldVoxelCollision +from curobo.geom.types import Cuboid, VoxelGrid, WorldConfig +from curobo.types.base import TensorDeviceType + + +def get_world_model(single_object: bool = False): + if single_object: + + world_model = WorldConfig.from_dict( + { + "cuboid": { + "block1": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]}, + } + } + ) + else: + world_model = WorldConfig.from_dict( + { + "cuboid": { + "block": {"dims": [0.5, 0.5, 0.5], "pose": [-0.25, 0, 0, 1, 0, 0, 0]}, + "block1": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]}, + } + } + ) + return world_model + + +@pytest.fixture(scope="function") +def world_collision(request): + world_model = get_world_model(request.param[1]) + + if request.param[0]: + world_model = world_model.get_mesh_world() + tensor_args = TensorDeviceType() + world_collision_config = WorldCollisionConfig.load_from_dict( + { + "checker_type": ( + CollisionCheckerType.PRIMITIVE + if not request.param[0] + else CollisionCheckerType.MESH + ), + "max_distance": 5.0, + "n_envs": 1, + }, + world_model, + tensor_args, + ) + if request.param[0]: + world_collision = WorldMeshCollision(world_collision_config) + else: + world_collision = WorldPrimitiveCollision(world_collision_config) + + return world_collision + + +def world_voxel_collision_checker(): + world_model = { + "voxel": { + "base": {"dims": [2.0, 2.0, 2.0], "pose": [0, 0, 0, 1, 0, 0, 0], "voxel_size": 0.05}, + } + } + tensor_args = TensorDeviceType() + world_collision_config = WorldCollisionConfig.load_from_dict( + { + "checker_type": CollisionCheckerType.VOXEL, + "max_distance": 5.0, + "n_envs": 1, + }, + world_model, + tensor_args, + ) + world_collision = WorldVoxelCollision(world_collision_config) + return world_collision + + +@pytest.mark.parametrize( + "world_collision", + [ + ([True, True]), + ([False, True]), + ([True, False]), + ([False, False]), + ], + indirect=True, +) +def test_voxel_esdf(world_collision): + + # create voxel collision checker + world_voxel_collision = world_voxel_collision_checker() + + voxel_grid = world_voxel_collision.get_voxel_grid("base") + esdf = world_collision.get_esdf_in_bounding_box( + Cuboid(name="base", pose=voxel_grid.pose, dims=voxel_grid.dims), + voxel_size=voxel_grid.voxel_size, + ) + + world_voxel_collision.update_voxel_data(esdf) + + voxel_size = 0.01 + esdf = world_collision.get_esdf_in_bounding_box( + Cuboid(name="base", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), voxel_size=voxel_size + ) + + esdf_voxel = world_voxel_collision.get_esdf_in_bounding_box( + Cuboid(name="base", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]), voxel_size=voxel_size + ) + esdf_data = esdf.feature_tensor + esdf_voxel_data = esdf_voxel.feature_tensor + esdf_data[esdf_data < -1.0] = 0.0 + esdf_voxel_data[esdf_voxel_data < -1.0] = 0.0 + error = torch.abs(esdf_data - esdf_voxel_data) + + assert torch.max(error) < 2 * voxel_grid.voxel_size + + +@pytest.mark.parametrize( + "world_collision", + [ + ([True, True]), + ([False, True]), + ([True, False]), + ([False, False]), + ], + 
indirect=True, +) +def test_primitive_voxel_sphere_distance(world_collision): + tensor_args = TensorDeviceType() + voxel_size = 0.025 + world_voxel_collision = world_voxel_collision_checker() + + voxel_grid = world_voxel_collision.get_voxel_grid("base") + esdf = world_collision.get_esdf_in_bounding_box( + Cuboid(name="base", pose=voxel_grid.pose, dims=voxel_grid.dims), + voxel_size=voxel_grid.voxel_size, + ) + + world_voxel_collision.update_voxel_data(esdf) + + # create a grid and compute distance: + sample_grid = VoxelGrid( + name="test", pose=[0, 0, 0, 1, 0, 0, 0], voxel_size=voxel_size, dims=[1, 1, 1] + ) + sample_spheres = sample_grid.create_xyzr_tensor() + sample_spheres = sample_spheres.reshape(-1, 1, 1, 4) + + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1.0]) + cuboid_collision_buffer = CollisionQueryBuffer.initialize_from_shape( + sample_spheres.shape, tensor_args, world_collision.collision_types + ) + voxel_collision_buffer = CollisionQueryBuffer.initialize_from_shape( + sample_spheres.shape, tensor_args, world_voxel_collision.collision_types + ) + d_cuboid = world_collision.get_sphere_distance( + sample_spheres, cuboid_collision_buffer, weight, act_distance + ) + d_voxel = world_voxel_collision.get_sphere_distance( + sample_spheres, voxel_collision_buffer, weight, act_distance + ) + + error = torch.abs(d_cuboid.view(-1) - d_voxel.view(-1)) + + assert torch.max(error) < voxel_grid.voxel_size + + +@pytest.mark.parametrize( + "world_collision", + [ + # ([True, True]), + ([False, True]), + # ([True, False]), + # ([False, False]), + ], + indirect=True, +) +def test_primitive_voxel_sphere_gradient(world_collision): + tensor_args = TensorDeviceType() + world_voxel_collision = world_voxel_collision_checker() + + voxel_grid = world_voxel_collision.get_voxel_grid("base") + esdf = world_collision.get_esdf_in_bounding_box( + Cuboid(name="base", pose=voxel_grid.pose, dims=voxel_grid.dims), + voxel_size=voxel_grid.voxel_size, + ) + voxel_size = voxel_grid.voxel_size + + world_voxel_collision.update_voxel_data(esdf) + + # create a grid and compute distance: + sample_grid = VoxelGrid( + name="test", pose=[0.0, 0.0, 0, 1, 0, 0, 0], voxel_size=voxel_size, dims=[0.1, 0.1, 0.1] + ) + # sample_grid = VoxelGrid( + # name="test", pose=[0.2, 0.0, 0, 1, 0, 0, 0], voxel_size=voxel_size, + # dims=[0.1, 0.1, 0.1] + # ) + sample_spheres = sample_grid.create_xyzr_tensor(transform_to_origin=True) + sample_spheres = sample_spheres.reshape(-1, 1, 1, 4) + + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1.0]) + cuboid_collision_buffer = CollisionQueryBuffer.initialize_from_shape( + sample_spheres.shape, tensor_args, world_collision.collision_types + ) + voxel_collision_buffer = CollisionQueryBuffer.initialize_from_shape( + sample_spheres.shape, tensor_args, world_voxel_collision.collision_types + ) + sample_spheres_1 = sample_spheres.clone() + sample_spheres_1.requires_grad = True + d_cuboid = world_collision.get_sphere_distance( + sample_spheres_1, cuboid_collision_buffer, weight, act_distance + ) + sample_spheres_2 = sample_spheres.clone() + sample_spheres_2.requires_grad = True + d_voxel = world_voxel_collision.get_sphere_distance( + sample_spheres_2, voxel_collision_buffer, weight, act_distance + ) + + error = torch.abs(d_cuboid.view(-1) - d_voxel.view(-1)) + + assert torch.max(error) < voxel_grid.voxel_size + + cuboid_gradient = cuboid_collision_buffer.get_gradient_buffer() + + voxel_gradient = voxel_collision_buffer.get_gradient_buffer() + 
error = torch.linalg.norm(cuboid_gradient - voxel_gradient, dim=-1) + print(cuboid_gradient) + print(voxel_gradient) + assert torch.max(error) < voxel_grid.voxel_size diff --git a/tests/voxelization_test.py b/tests/voxelization_test.py new file mode 100644 index 0000000..336c94a --- /dev/null +++ b/tests/voxelization_test.py @@ -0,0 +1,181 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionCheckerType, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import Mesh, WorldConfig +from curobo.types.base import TensorDeviceType + + +def get_world_model(single_object: bool = False): + if single_object: + + world_model = WorldConfig.from_dict( + { + "cuboid": { + "block1": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]}, + } + } + ) + else: + world_model = WorldConfig.from_dict( + { + "cuboid": { + "block": {"dims": [0.5, 0.5, 0.5], "pose": [-0.25, 0, 0, 1, 0, 0, 0]}, + "block1": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]}, + } + } + ) + return world_model + + +@pytest.fixture(scope="function") +def world_collision(request): + world_model = get_world_model(request.param[1]) + + if request.param[0]: + world_model = world_model.get_mesh_world() + tensor_args = TensorDeviceType() + world_collision_config = WorldCollisionConfig.load_from_dict( + { + "checker_type": ( + CollisionCheckerType.PRIMITIVE + if not request.param[0] + else CollisionCheckerType.MESH + ), + "max_distance": 5.0, + "n_envs": 1, + }, + world_model, + tensor_args, + ) + if request.param[0]: + world_collision = WorldMeshCollision(world_collision_config) + else: + world_collision = WorldPrimitiveCollision(world_collision_config) + + return world_collision + + +@pytest.fixture(scope="function") +def world_collision_primitive(): + world_model = get_world_model(True) + + tensor_args = TensorDeviceType() + world_collision_config = WorldCollisionConfig.load_from_dict( + { + "checker_type": (CollisionCheckerType.PRIMITIVE), + "max_distance": 5.0, + "n_envs": 1, + }, + world_model, + tensor_args, + ) + + world_collision = WorldPrimitiveCollision(world_collision_config) + + return world_collision + + +@pytest.mark.parametrize( + "world_collision", + [ + (True, True), + (False, True), + ], + indirect=True, +) +def test_voxels_from_world(world_collision): + voxel_size = 0.1 + voxels = world_collision.get_voxels_in_bounding_box(voxel_size=voxel_size) + assert voxels.shape[0] > 4 + + +@pytest.mark.skip(reason="Not ready yet.") +@pytest.mark.parametrize( + "world_collision", + [ + (True, True), + (False, True), + ], + indirect=True, +) +def test_esdf_from_world(world_collision): + voxel_size = 0.02 + voxels = world_collision.get_voxels_in_bounding_box(voxel_size=voxel_size).clone() + world_collision.clear_voxelization_cache() + esdf = world_collision.get_esdf_in_bounding_box(voxel_size=voxel_size).clone() + + occupied = esdf.get_occupied_voxels() + + assert voxels.shape == occupied.shape + + 
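+# The parametrized tests below reuse the ``world_collision`` fixture's
+# (use_mesh_world, single_object) convention: the first flag switches between
+# the mesh and primitive collision checkers, the second between the one-block
+# and two-block worlds built by ``get_world_model``.
+def _esdf_occupancy_sketch(world_collision, voxel_size: float = 0.1):
+    # Illustrative sketch only (not called by any test, name is arbitrary):
+    # query an ESDF over the checker's bounding box and return the occupied
+    # voxel centers, the same pattern the comparisons below exercise.
+    esdf_grid = world_collision.get_esdf_in_bounding_box(voxel_size=voxel_size)
+    return esdf_grid.get_occupied_voxels()
+
+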
+@pytest.mark.parametrize( + "world_collision", + [ + (True, True), + (False, True), + ], + indirect=True, +) +def test_voxels_prim_mesh(world_collision, world_collision_primitive): + voxel_size = 0.1 + voxels = world_collision.get_voxels_in_bounding_box(voxel_size=voxel_size).clone() + voxels_prim = world_collision_primitive.get_voxels_in_bounding_box( + voxel_size=voxel_size + ).clone() + assert voxels.shape == voxels_prim.shape + + +@pytest.mark.parametrize( + "world_collision", + [ + (True, True), + # (False, True), + ], + indirect=True, +) +def test_esdf_prim_mesh(world_collision, world_collision_primitive): + voxel_size = 0.1 + esdf = world_collision.get_esdf_in_bounding_box(voxel_size=voxel_size).clone() + esdf_prim = world_collision_primitive.get_esdf_in_bounding_box(voxel_size=voxel_size).clone() + print(esdf.voxel_size, esdf_prim.voxel_size) + voxels = esdf.get_occupied_voxels() + voxels_prim = esdf_prim.get_occupied_voxels() + assert voxels.shape == voxels_prim.shape + + +@pytest.mark.parametrize( + "world_collision", + [ + (True, True), + (False, True), + (True, False), + (False, False), + ], + indirect=True, +) +def test_marching_cubes_from_world(world_collision): + voxel_size = 0.1 + voxels = world_collision.get_voxels_in_bounding_box(voxel_size=voxel_size) + mesh = Mesh.from_pointcloud(voxels[:, :3].detach().cpu().numpy(), pitch=voxel_size * 0.1) + + mesh.save_as_mesh("test_" + str(len(voxels)) + ".stl") + assert len(mesh.vertices) > 100 # exact value is 240 diff --git a/tests/warp_mesh_test.py b/tests/warp_mesh_test.py index fa2bf5d..4b358c5 100644 --- a/tests/warp_mesh_test.py +++ b/tests/warp_mesh_test.py @@ -32,7 +32,7 @@ def test_sdf_pose(): new_mesh, env_idx=0, ) - query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres = torch.zeros((1, 1, 2, 4), **(tensor_args.as_torch_dict())) query_spheres[..., 2] = 10.0 query_spheres[..., 1, :] = 0.0 query_spheres[..., 3] = 1.0 @@ -59,7 +59,7 @@ def test_swept_sdf_speed_pose(): new_mesh, env_idx=0, ) - query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres = torch.zeros((1, 1, 2, 4), **(tensor_args.as_torch_dict())) query_spheres[..., 2] = 10.0 query_spheres[..., 1, :] = 0.0 query_spheres[..., 3] = 1.0 @@ -88,7 +88,7 @@ def test_swept_sdf_pose(): new_mesh, env_idx=0, ) - query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres = torch.zeros((1, 1, 2, 4), **(tensor_args.as_torch_dict())) query_spheres[..., 2] = 10.0 query_spheres[..., 1, :] = 0.0 query_spheres[..., 3] = 1.0 diff --git a/tests/world_config_test.py b/tests/world_config_test.py index d8e8f9f..4bf6216 100644 --- a/tests/world_config_test.py +++ b/tests/world_config_test.py @@ -91,7 +91,8 @@ def test_world_modify(): world_ccheck.update_obstacle_pose(name="cylinder_1", w_obj_pose=w_pose) x_sph = torch.as_tensor( - [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.1], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.1], [0.01, 0.01, 0.0, 0.1]], + **(tensor_args.as_torch_dict()) ).view(1, 1, -1, 4) # create buffers: query_buffer = CollisionQueryBuffer.initialize_from_shape(