diff --git a/.gitignore b/.gitignore
index 640ce54..1bc38d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -200,4 +200,5 @@ nvidia_curobo*/
*tfevents*
-*.csv
\ No newline at end of file
+*.csv
+docs/*
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ddb098..f9e37da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,12 +9,62 @@ without an express license agreement from NVIDIA CORPORATION or
its affiliates is strictly prohibited.
-->
# Changelog
-## Latest Commit
-### BugFixes
-- refactored wp.index() instances to `[]` to avoid errors in using with warp-lang>=0.11.
-- Fixed bug in gaussian transformation to ensure values are not -1 or +1.
+
+## Version 0.6.3
+### Changes in default behavior
+- Increased default collision cache to 50 in RobotWorld.
+- Changed `CSpaceConfig.position_limit_clip` default to 0 as previous default of 0.01 can make
+default start state in examples be out of bounds.
+- MotionGen uses parallel_finetune by default. To get previous motion gen behavior, pass
+`warmup(parallel_finetune=False)` and `MotionGenPlanConfig(parallel_finetune=False)`.
+- MotionGen loads Mesh Collision checker instead of Primitive by default.
+
+### Breaking Changes
+- Renamed `copy_if_not_none` to `clone_if_not_none` to be more descriptive. Now `copy_if_not_none`
+will try to copy data into reference.
+- Renamed `n_envs` in curobo.opt module to avoid confusion between parallel environments and
+parallel problems in optimization.
+- Added more inputs to pose distance kernels. Check `curobolib/geom.py`.
+- Pose cost `run_vec_weight` should now be `[0,0,0,0,0,0]` instead of `[1,1,1,1,1,1]`
+
+### New Features
+- Add function to disable and enable collision for specific links in KinematicsTensorConfig.
+- Add goal index to reacher results to return index of goal reached when goalset planning.
+- Add locked joint state update api in MotionGen class.
+- Add goalset warmup padding to handle varied number of goals during goalset planning and also when
+calling plan_single after warmup of goalset.
+- Add new trajopt config to allow for smooth solutions at slow speeds (`velocity_scale<=0.25`). Also
+add error when `velocity_scale<0.1`.
+- Add experimental robot image segmentation module to enable robot removal in depth images.
+- Add constrained planning mode to motion_gen.
+
+### BugFixes & Misc.
+- refactored wp.index() instances to `[]` to avoid errors in future releases of warp.
+- Fix bug in gaussian transformation to ensure values are not -1 or +1.
+- Fix bug in ik_solver loading ee_link_name from argument.
+- Fix bug in batch_goalset planning, where pose cost was selected as GOALSET instead of
+BATCH_GOALSET.
+- Added package data to also export `.so` files.
- Fixed bug in transforming link visual mesh offset when reading from urdf.
- Fixed bug in MotionGenPlanConfig.clone() that didn't clone the state of parallel_finetune.
+- Increased weighting from 1.0 to 5.0 for optimized_dt in TrajEvaluator to select shorter
+trajectories.
+- Improved determinism by setting global seed for random in `graph_nx.py`.
+- Added option to clear obstacles in WorldPrimitiveCollision.
+- Raise error when reference of tensors change in MotionGen, IKSolver, and TrajOpt when cuda graph
+is enabled.
+- plan_single will get converted to plan_goalset when a plan_goalset was used to initialize cuda
+graph.
+- plan_goalset will pad for extra goals when called with less number of goal than initial creation.
+- Improved API documentation for Optimizer class.
+- Improved benchmark timings, now within 15ms of results reported in technical report. Added
+numbers to benchmark [webpage](https://curobo.org/source/getting_started/4_benchmarks.html) for
+easy reference.
+- Set `use_cuda_graph` to `True` as default from `None` in `MotionGenConfig.load_from_robot_config`
+
+### Known Bugs (WIP)
+- Examples don't run in Isaac Sim 2023.1.1 due to behavior change in urdf importer.
+
## Version 0.6.2
### New Features
- Added support for actuated axis to be negative (i.e., urdf joints with `` are
@@ -35,6 +85,7 @@ Run `benchmark/ik_benchmark.py` to get the latest results.
- Added `external_asset_path` to robot configuration to help in loading urdf and meshes from an
external directory.
+
### BugFixes & Misc.
- Update nvblox wrappers to work with v0.0.5 without segfaults. Significantly improves stability.
- Remove mimic joints in franka panda to maintain compatibility with Isaac Sim 2023.1.0 and 2022.2.1
@@ -60,6 +111,7 @@ planning time, 20ms slower on 4090 and 40ms on ORIN MAXN. We will address this s
release. One way to avoid this regression is to set `finetune_dt_scale=1.05` in
`MotionGenConfig.load_from_robot_config()`.
+
## Version 0.6.1
- Added changes to `examples/isaac_sim` to support Isaac Sim 2023.1.0
diff --git a/MANIFEST.in b/MANIFEST.in
index 2f36d88..0ee060b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -15,4 +15,5 @@
# graft
graft src/curobo/content
-global-include py.typed
\ No newline at end of file
+global-include py.typed
+graft src/curobo/curobolib/*.so
\ No newline at end of file
diff --git a/benchmark/README.md b/benchmark/README.md
index f1e39dc..77c4937 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -12,10 +12,47 @@ This folder contains scripts to run the motion planning benchmarks.
Refer to Benchmarks & Profiling instructions: https://curobo.org/source/getting_started/4_benchmarks.html.
-### Note
-
Results in the arxiv paper were obtained from v0.6.0.
-v0.6.2+ has significant changes to improve motion quality with lower motion time, lower path length, higher pose accuracy (<1mm). v0.6.2+ sacrifices 30ms (RTX 4090) of compute time to achieve signficantly better solutions. The new results are yet to be added to the technical report. For now, to get the latest benchmark results follow instructions here: https://curobo.org/source/getting_started/4_benchmarks.html.
+v0.6.2+ has significant changes to improve motion quality with lower motion time, lower path length, higher pose accuracy (<1mm). v0.6.2+ sacrifices 15ms (RTX 4090) of compute time to achieve signficantly better solutions. The new results are yet to be added to the technical report. For now, to get the latest benchmark results follow instructions here: https://curobo.org/source/getting_started/4_benchmarks.html.
-To get results similar to in the technical report, pass `--report_edition` to `curobo_benchmark.py`.
\ No newline at end of file
+To get results similar to in the technical report, pass `--report_edition` to `curobo_benchmark.py`.
+
+
+# Latest Results (Feb 2024)
+
+Motion Generation on 2600 problems from motion benchmaker and motion policy networks, gives the
+following values on a RTX 4090:
+
+| Metric | Value |
+|-------------------|--------------------------------------------------------------|
+|Success % | 99.84 |
+|Plan Time (s) | mean: 0.068 ± 0.158 median:0.042 75%: 0.055 98%: 0.246 |
+|Motion Time (s) | mean: 1.169 ± 0.360 median:1.140 75%: 1.381 98%: 2.163 |
+|Path Length (rad.) | mean: 3.177 ± 1.072 median:3.261 75%: 3.804 98%: 5.376 |
+|Jerk | mean: 97.700 ± 48.630 median:88.993 75%: 126.092 98%: 199.540|
+|Position Error (mm)| mean: 0.119 ± 0.341 median:0.027 75%: 0.091 98%: 1.039 |
+
+
+## Motion Benchmaker (800 problems):
+
+| Metric | Value |
+|-------------------|--------------------------------------------------------------|
+|Success % | 100 |
+|Plan Time (s) | mean: 0.063 ± 0.137 median:0.042 75%: 0.044 98%: 0.206 |
+|Motion Time (s) | mean: 1.429 ± 0.330 median:1.392 75%: 1.501 98%: 2.473 |
+|Path Length (rad.) | mean: 3.956 ± 0.783 median:3.755 75%: 4.352 98%: 6.041 |
+|Jerk | mean: 67.284 ± 27.788 median:61.853 75%: 83.337 98%: 143.118 |
+|Position Error (mm)| mean: 0.079 ± 0.139 median:0.032 75%: 0.077 98%: 0.472 |
+
+
+## Motion Policy Networks (1800 problems):
+
+| Metric | Value |
+|-------------------|-----------------------------------------------------------------|
+|Success % | 99.77 |
+|Plan Time (s) | mean: 0.068 ± 0.117 median:0.042 75%: 0.059 98%: 0.243 |
+|Motion Time (s) | mean: 1.051 ± 0.306 median:1.016 75%: 1.226 98%: 1.760 |
+|Path Length (rad.) | mean: 2.829 ± 1.000 median:2.837 75%: 3.482 98%: 4.905 |
+|Jerk | mean: 110.610 ± 47.586 median:105.271 75%: 141.517 98%: 217.158 |
+|Position Error (mm)| mean: 0.122 ± 0.343 median:0.024 75%: 0.095 98%: 1.114 |
diff --git a/benchmark/curobo_benchmark.py b/benchmark/curobo_benchmark.py
index d3bd0c7..944282b 100644
--- a/benchmark/curobo_benchmark.py
+++ b/benchmark/curobo_benchmark.py
@@ -10,12 +10,15 @@
#
# Standard Library
+import argparse
+import random
from copy import deepcopy
from typing import Optional
# Third Party
import numpy as np
import torch
+from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw
from tqdm import tqdm
# CuRobo
@@ -26,6 +29,7 @@ from curobo.types.math import Pose
from curobo.types.robot import RobotConfig
from curobo.types.state import JointState
from curobo.util.logger import setup_curobo_logger
+from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics
from curobo.util_file import (
get_assets_path,
get_robot_configs_path,
@@ -36,28 +40,16 @@ from curobo.util_file import (
)
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig
-# torch.set_num_threads(8)
-# torch.use_deterministic_algorithms(True)
-torch.manual_seed(0)
+# set seeds
+torch.manual_seed(2)
torch.backends.cudnn.benchmark = True
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
-# torch.backends.cuda.matmul.allow_tf32 = False
-# torch.backends.cudnn.allow_tf32 = False
-
-# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-np.random.seed(0)
-# Standard Library
-import argparse
-import warnings
-from typing import List, Optional
-
-# Third Party
-from metrics import CuroboGroupMetrics, CuroboMetrics
-from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw
+np.random.seed(2)
+random.seed(2)
def plot_cost_iteration(cost: torch.Tensor, save_path="cost", title="", log_scale=False):
@@ -174,7 +166,7 @@ def load_curobo(
world_cfg = WorldConfig.from_dict(
load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
).get_obb_world()
- interpolation_steps = 2000
+ interpolation_steps = 1000
c_checker = CollisionCheckerType.PRIMITIVE
c_cache = {"obb": n_cubes}
if mesh_mode:
@@ -189,7 +181,7 @@ def load_curobo(
K = robot_cfg_instance.kinematics.kinematics_config.joint_limits
K.position[0, :] -= 0.2
K.position[1, :] += 0.2
- finetune_iters = None
+ finetune_iters = 300
grad_iters = None
if args.report_edition:
finetune_iters = 200
@@ -215,7 +207,7 @@ def load_curobo(
collision_activation_distance=collision_activation_distance,
trajopt_dt=0.25,
finetune_dt_scale=finetune_dt_scale,
- maximum_trajectory_dt=0.1,
+ maximum_trajectory_dt=0.16,
)
mg = MotionGen(motion_gen_config)
mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=parallel_finetune)
@@ -238,7 +230,7 @@ def benchmark_mb(
interpolation_dt = 0.02
# mpinets_data = True
# if mpinets_data:
- file_paths = [motion_benchmaker_raw, mpinets_raw][:] # [1:]
+ file_paths = [motion_benchmaker_raw, mpinets_raw][:]
if args.demo:
file_paths = [demo_raw]
@@ -269,13 +261,13 @@ def benchmark_mb(
if "cage_panda" in key:
trajopt_seeds = 16
finetune_dt_scale = 0.95
- collision_activation_distance = 0.01
parallel_finetune = True
if "table_under_pick_panda" in key:
- tsteps = 44
+ tsteps = 40
trajopt_seeds = 24
- finetune_dt_scale = 0.98
+ finetune_dt_scale = 0.95
parallel_finetune = True
+
if "table_pick_panda" in key:
collision_activation_distance = 0.005
@@ -297,15 +289,19 @@ def benchmark_mb(
"cubby_neutral_start",
"cubby_neutral_goal",
"dresser_neutral_start",
+ "dresser_neutral_goal",
"tabletop_task_oriented",
]:
collision_activation_distance = 0.005
if key in ["dresser_neutral_goal"]:
trajopt_seeds = 24 # 24
- tsteps = 36
- collision_activation_distance = 0.005
- scene_problems = problems[key]
+ # tsteps = 36
+ collision_activation_distance = 0.01
+ # trajopt_seeds = 12
+ scene_problems = problems[key] # [:10]
n_cubes = check_problems(scene_problems)
+ # print(n_cubes)
+ # continue
mg, robot_cfg = load_curobo(
n_cubes,
enable_debug,
@@ -327,11 +323,10 @@ def benchmark_mb(
for problem in tqdm(scene_problems, leave=False):
i += 1
if problem["collision_buffer_ik"] < 0.0:
- # print("collision_ik:", problem["collision_buffer_ik"])
continue
plan_config = MotionGenPlanConfig(
- max_attempts=100, # 100, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000,
+ max_attempts=100,
enable_graph_attempt=1,
disable_graph_attempt=20,
enable_finetune_trajopt=True,
@@ -349,24 +344,13 @@ def benchmark_mb(
problem_name = "d_" + key + "_" + str(i)
# reset planner
- mg.reset(reset_seed=False)
+ mg.reset(reset_seed=True)
if args.mesh:
world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world()
else:
world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world()
mg.world_coll_checker.clear_cache()
mg.update_world(world)
- # from curobo.geom.types import Cuboid as curobo_Cuboid
-
- # coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box(
- # curobo_Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1, 1, 1]),
- # voxel_size=0.01,
- # )
- #
- # coll_mesh.save_as_mesh(problem_name + "debug_curobo.obj")
- # exit()
- # continue
- # load obstacles
# run planner
start_state = JointState.from_position(mg.tensor_args.to_device([q_start]))
@@ -379,7 +363,6 @@ def benchmark_mb(
)
if result.status == "IK Fail":
ik_fail += 1
- # rint(plan_config.enable_graph, plan_config.enable_graph_attempt)
problem["solution"] = None
problem_name = key + "_" + str(i)
@@ -432,7 +415,6 @@ def benchmark_mb(
log_scale=False,
)
if result.success.item():
- # print("GT: ", result.graph_time)
q_traj = result.get_interpolated_plan()
problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist()
problem["solution"] = {
@@ -458,8 +440,7 @@ def benchmark_mb(
.tolist(),
"dt": interpolation_dt,
}
- # print(problem["solution"]["position"])
- # exit()
+
debug = {
"used_graph": result.used_graph,
"attempts": result.attempts,
@@ -487,14 +468,7 @@ def benchmark_mb(
"valid_query": result.valid_query,
}
problem["solution_debug"] = debug
- # print(
- # "T: ",
- # result.motion_time.item(),
- # result.optimized_dt.item(),
- # (len(problem["solution"]["position"]) - 1 ) * result.interpolation_dt,
- # result.interpolation_dt,
- # )
- # exit()
+
reached_pose = mg.compute_kinematics(result.optimized_plan[-1]).ee_pose
rot_error = goal_pose.angular_distance(reached_pose) * 100.0
if args.graph:
@@ -526,6 +500,7 @@ def benchmark_mb(
motion_time=result.motion_time.item(),
solve_time=solve_time,
cspace_path_length=path_length,
+ jerk=torch.max(torch.abs(result.optimized_plan.jerk)).item(),
)
if write_usd:
@@ -552,20 +527,9 @@ def benchmark_mb(
plot_traj(
result.optimized_plan,
result.optimized_dt.item(),
- # result.get_interpolated_plan(),
- # result.interpolation_dt,
title=problem_name,
save_path=join_path("benchmark/log/plot/", problem_name + ".png"),
)
- # plot_traj(
- # # result.optimized_plan,
- # # result.optimized_dt.item(),
- # result.get_interpolated_plan(),
- # result.interpolation_dt,
- # title=problem_name,
- # save_path=join_path("benchmark/log/plot/", problem_name + "_int.pdf"),
- # )
- # exit()
m_list.append(current_metrics)
all_groups.append(current_metrics)
@@ -587,7 +551,6 @@ def benchmark_mb(
m_list.append(current_metrics)
all_groups.append(current_metrics)
else:
- # print("invalid: " + problem_name)
debug = {
"used_graph": result.used_graph,
"attempts": result.attempts,
@@ -601,28 +564,7 @@ def benchmark_mb(
}
problem["solution_debug"] = debug
- if False:
- world.save_world_as_mesh(problem_name + ".obj")
-
- q_traj = start_state # .unsqueeze(0)
- # CuRobo
- from curobo.util.usd_helper import UsdHelper
-
- UsdHelper.write_trajectory_animation_with_robot_usd(
- robot_cfg,
- world,
- start_state,
- q_traj,
- dt=result.interpolation_dt,
- save_path=join_path("benchmark/log/usd/", problem_name) + ".usd",
- interpolation_steps=1,
- write_robot_usd_path="benchmark/log/usd/assets/",
- robot_usd_local_reference="assets/",
- base_frame="/world_" + problem_name,
- visualize_robot_spheres=True,
- )
if save_log and not result.success.item():
- # print("save log")
UsdHelper.write_motion_gen_log(
result,
robot_cfg,
@@ -637,8 +579,7 @@ def benchmark_mb(
write_robot_usd_path="benchmark/log/usd/assets/",
# flatten_usd=True,
)
- # print(result.status)
- # exit()
+ print(result.status)
g_m = CuroboGroupMetrics.from_list(m_list)
if not args.kpi:
@@ -653,10 +594,6 @@ def benchmark_mb(
g_m.motion_time.percent_98,
)
print(g_m.attempts)
- # print("MT: ", g_m.motion_time)
- # $print(ik_fail)
- # exit()
- # print(g_m.position_error, g_m.orientation_error)
g_m = CuroboGroupMetrics.from_list(all_groups)
if not args.kpi:
@@ -668,12 +605,8 @@ def benchmark_mb(
g_m.time.percent_75,
g_m.position_error.percent_75,
g_m.orientation_error.percent_75,
- ) # g_m.time, g_m.attempts)
- # print("MT: ", g_m.motion_time)
+ )
- # print(g_m.position_error, g_m.orientation_error)
-
- # exit()
if write_benchmark:
if not mpinets_data:
write_yaml(problems, args.file_name + "_mb_solution.yaml")
@@ -681,7 +614,6 @@ def benchmark_mb(
write_yaml(problems, args.file_name + "_mpinets_solution.yaml")
all_files += all_groups
g_m = CuroboGroupMetrics.from_list(all_files)
- # print(g_m.success, g_m.time, g_m.attempts, g_m.position_error, g_m.orientation_error)
print("######## FULL SET ############")
print("All: ", f"{g_m.success:2.2f}")
print("MT: ", g_m.motion_time)
@@ -690,6 +622,7 @@ def benchmark_mb(
print("ST: ", g_m.solve_time)
print("position error (mm): ", g_m.position_error)
print("orientation error(%): ", g_m.orientation_error)
+ print("jerk: ", g_m.jerk)
if args.kpi:
kpi_data = {
@@ -702,8 +635,6 @@ def benchmark_mb(
}
write_yaml(kpi_data, join_path(args.save_path, args.file_name + ".yml"))
- # run on mb dataset:
-
def check_problems(all_problems):
n_cube = 0
@@ -801,12 +732,13 @@ if __name__ == "__main__":
args = parser.parse_args()
setup_curobo_logger("error")
- benchmark_mb(
- save_log=False,
- write_usd=args.save_usd,
- write_plot=args.save_plot,
- write_benchmark=args.write_benchmark,
- plot_cost=False,
- graph_mode=args.graph,
- args=args,
- )
+ for _ in range(1):
+ benchmark_mb(
+ save_log=False,
+ write_usd=args.save_usd,
+ write_plot=args.save_plot,
+ write_benchmark=args.write_benchmark,
+ plot_cost=False,
+ graph_mode=args.graph,
+ args=args,
+ )
diff --git a/benchmark/curobo_nvblox_benchmark.py b/benchmark/curobo_nvblox_benchmark.py
index cefb69c..3e6ffde 100644
--- a/benchmark/curobo_nvblox_benchmark.py
+++ b/benchmark/curobo_nvblox_benchmark.py
@@ -18,7 +18,6 @@ from typing import Optional
import matplotlib.pyplot as plt
import numpy as np
import torch
-from metrics import CuroboGroupMetrics, CuroboMetrics
from nvblox_torch.datasets.mesh_dataset import MeshDataset
from nvblox_torch.datasets.sun3d_dataset import Sun3dDataset
from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw
@@ -34,6 +33,7 @@ from curobo.types.math import Pose
from curobo.types.robot import RobotConfig
from curobo.types.state import JointState
from curobo.util.logger import setup_curobo_logger
+from curobo.util.metrics import CuroboGroupMetrics, CuroboMetrics
from curobo.util_file import (
get_assets_path,
get_robot_configs_path,
@@ -126,11 +126,12 @@ def load_curobo(
mpinets: bool = False,
graph_mode: bool = False,
cuda_graph: bool = True,
+ collision_activation_distance: float = 0.025,
):
robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"]
- robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.0
+ robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.0
- ik_seeds = 30 # 500
+ ik_seeds = 30
if graph_mode:
trajopt_seeds = 4
if trajopt_seeds >= 14:
@@ -146,7 +147,7 @@ def load_curobo(
"world": {
"pose": [0, 0, 0, 1, 0, 0, 0],
"integrator_type": "tsdf",
- "voxel_size": 0.014,
+ "voxel_size": 0.01,
}
}
}
@@ -174,16 +175,17 @@ def load_curobo(
store_ik_debug=enable_debug,
store_trajopt_debug=enable_debug,
interpolation_steps=interpolation_steps,
- collision_activation_distance=0.01,
+ collision_activation_distance=collision_activation_distance,
trajopt_dt=0.25,
finetune_dt_scale=1.0,
maximum_trajectory_dt=0.1,
+ finetune_trajopt_iters=300,
)
mg = MotionGen(motion_gen_config)
- mg.warmup(enable_graph=True, warmup_js_trajopt=False)
+ mg.warmup(enable_graph=True, warmup_js_trajopt=False, parallel_finetune=True)
# create a ground truth collision checker:
config = RobotWorldConfig.load_from_config(
- robot_cfg,
+ robot_cfg_instance,
"collision_table.yml",
collision_activation_distance=0.0,
collision_checker_type=CollisionCheckerType.PRIMITIVE,
@@ -206,7 +208,7 @@ def benchmark_mb(
# load dataset:
graph_mode = args.graph
interpolation_dt = 0.02
- file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][1:]
+ file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][2:]
enable_debug = save_log or plot_cost
all_files = []
@@ -215,43 +217,73 @@ def benchmark_mb(
og_tsteps = override_tsteps
og_trajopt_seeds = 12
-
+ og_collision_activation_distance = 0.03 # 0.03
if args.graph:
og_trajopt_seeds = 4
for file_path in file_paths:
all_groups = []
mpinets_data = False
problems = file_path()
- if "dresser_task_oriented" in list(problems.keys()):
- mpinets_data = True
-
- mg, robot_cfg, robot_world = load_curobo(
- 1,
- enable_debug,
- og_tsteps,
- og_trajopt_seeds,
- mpinets_data,
- graph_mode,
- not args.disable_cuda_graph,
- )
for key, v in tqdm(problems.items()):
scene_problems = problems[key]
m_list = []
- i = 0
+ i = -1
ik_fail = 0
+ trajopt_seeds = og_trajopt_seeds
+ tsteps = og_tsteps
+ collision_activation_distance = og_collision_activation_distance
+ if "dresser_task_oriented" in list(problems.keys()):
+ mpinets_data = True
+
+ if "cage_panda" in key:
+ trajopt_seeds = 16
+ finetune_dt_scale = 0.95
+ if "table_under_pick_panda" in key:
+ tsteps = 44
+ trajopt_seeds = 16
+ finetune_dt_scale = 0.98
+
+ if "cubby_task_oriented" in key and "merged" not in key:
+ trajopt_seeds = 24
+ finetune_dt_scale = 0.95
+ collision_activation_distance = 0.035
+ if "dresser_task_oriented" in key:
+ trajopt_seeds = 24
+ finetune_dt_scale = 0.95
+ collision_activation_distance = 0.035 # 0.035
+
+ if "merged_cubby_task_oriented" in key:
+ collision_activation_distance = 0.025 # 0.035
+ if "tabletop_task_oriented" in key:
+ collision_activation_distance = 0.025 # 0.035
+ if key in ["dresser_neutral_goal"]:
+ trajopt_seeds = 24
+ collision_activation_distance = og_collision_activation_distance
+ mg, robot_cfg, robot_world = load_curobo(
+ 1,
+ enable_debug,
+ tsteps,
+ trajopt_seeds,
+ mpinets_data,
+ graph_mode,
+ not args.disable_cuda_graph,
+ collision_activation_distance=collision_activation_distance,
+ )
for problem in tqdm(scene_problems, leave=False):
i += 1
+ if problem["collision_buffer_ik"] < 0.0:
+ continue
plan_config = MotionGenPlanConfig(
- max_attempts=10, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000,
- enable_graph_attempt=3,
- disable_graph_attempt=20,
+ max_attempts=10,
+ enable_graph_attempt=1,
enable_finetune_trajopt=True,
partial_ik_opt=False,
enable_graph=graph_mode,
timeout=60,
enable_opt=not graph_mode,
+ parallel_finetune=True,
)
q_start = problem["start"]
@@ -265,23 +297,15 @@ def benchmark_mb(
mg.reset(reset_seed=False)
world = WorldConfig.from_dict(problem["obstacles"])
- # .get_mesh_world(
- # # merge_meshes=True
- # )
- # mesh = world.mesh[0].get_trimesh_mesh()
-
- # world.save_world_as_mesh(problem_name + ".stl")
+ mg.world_coll_checker.clear_cache()
mg.world_coll_checker.update_blox_hashes()
- mg.world_coll_checker.clear_cache()
save_path = "benchmark/log/nvblox/" + key + "_" + str(i)
m_dataset = Sun3dDataset(save_path)
- # m_dataset = MeshDataset(
- # None, n_frames=100, image_size=640, save_data_dir=None, trimesh_mesh=mesh
- # )
+
tensor_args = mg.tensor_args
- for j in tqdm(range(len(m_dataset)), leave=False):
+ for j in tqdm(range(min(1, len(m_dataset))), leave=False):
data = m_dataset[j]
cam_obs = CameraObservation(
rgb_image=tensor_args.to_device(data["rgba"])
@@ -300,35 +324,12 @@ def benchmark_mb(
torch.cuda.synchronize()
mg.world_coll_checker.update_blox_hashes()
torch.cuda.synchronize()
+ # if i > 2:
+ # mg.world_coll_checker.clear_cache()
+ # mg.world_coll_checker.update_blox_hashes()
+
# mg.world_coll_checker.save_layer("world", "test.nvblx")
- if save_log or write_usd:
- world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2])
-
- nvblox_obs = mg.world_coll_checker.get_mesh_from_blox_layer(
- "world",
- )
- # nvblox_obs.vertex_colors = None
-
- if nvblox_obs.vertex_colors is not None:
- nvblox_obs.vertex_colors = nvblox_obs.vertex_colors.cpu().numpy()
- else:
- nvblox_obs.color = [0.0, 0.0, 0.8, 0.8]
-
- nvblox_obs.name = "nvblox_mesh_world"
- world.add_obstacle(nvblox_obs)
-
- coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box(
- curobo_Cuboid(name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.5, 1.5, 1]),
- voxel_size=0.005,
- )
-
- coll_mesh.color = [0.0, 0.8, 0.8, 0.8]
-
- coll_mesh.name = "nvblox_voxel_world"
- world.add_obstacle(coll_mesh)
- # exit()
- # run planner
start_state = JointState.from_position(mg.tensor_args.to_device([q_start]))
result = mg.plan_single(
start_state,
@@ -338,20 +339,6 @@ def benchmark_mb(
if result.status == "IK Fail":
ik_fail += 1
problem["solution"] = None
- if write_usd or save_log:
- # CuRobo
- from curobo.util.usd_helper import UsdHelper
-
- gripper_mesh = Mesh(
- name="target_gripper_1",
- file_path=join_path(
- get_assets_path(),
- "robot/franka_description/meshes/visual/hand.dae",
- ),
- color=[0.0, 0.8, 0.1, 1.0],
- pose=pose,
- )
- world.add_obstacle(gripper_mesh)
# get costs:
if plot_cost:
@@ -437,9 +424,12 @@ def benchmark_mb(
"valid_query": result.valid_query,
}
problem["solution_debug"] = debug
+ world_coll = WorldConfig.from_dict(problem["obstacles"]).get_obb_world()
+
# check if path is collision free w.r.t. ground truth mesh:
- robot_world.world_model.clear_cache()
- robot_world.update_world(world)
+ # robot_world.world_model.clear_cache()
+ robot_world.update_world(world_coll)
+
q_int_traj = result.get_interpolated_plan().position.unsqueeze(0)
d_int_mask = (
torch.count_nonzero(~robot_world.validate_trajectory(q_int_traj)) == 0
@@ -449,7 +439,14 @@ def benchmark_mb(
d_mask = (
torch.count_nonzero(~robot_world.validate_trajectory(q_traj)) == 0
).item()
-
+ # d_world, _ = robot_world.get_world_self_collision_distance_from_joint_trajectory(
+ # q_traj)
+ # thres_dist = robot_world.contact_distance
+ # in_collision = d_world.squeeze(0) > thres_dist
+ # d_mask = not torch.any(in_collision, dim=-1).item()
+ # if not d_mask:
+ # write_usd = True
+ # #print(torch.max(d_world).item(), problem_name)
current_metrics = CuroboMetrics(
skip=False,
success=True,
@@ -466,10 +463,40 @@ def benchmark_mb(
attempts=result.attempts,
motion_time=result.motion_time.item(),
solve_time=result.solve_time,
+ jerk=torch.max(torch.abs(result.optimized_plan.jerk)).item(),
)
+ if save_log or write_usd:
+ world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2])
+
+ nvblox_obs = mg.world_coll_checker.get_mesh_from_blox_layer(
+ "world",
+ )
+ # nvblox_obs.vertex_colors = None
+
+ if nvblox_obs.vertex_colors is not None:
+ nvblox_obs.vertex_colors = nvblox_obs.vertex_colors.cpu().numpy()
+ else:
+ nvblox_obs.color = [0.0, 0.0, 0.8, 0.8]
+
+ nvblox_obs.name = "nvblox_mesh_world"
+ world.add_obstacle(nvblox_obs)
+
+ coll_mesh = mg.world_coll_checker.get_mesh_in_bounding_box(
+ curobo_Cuboid(
+ name="test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.5, 1.5, 1]
+ ),
+ voxel_size=0.005,
+ )
+
+ coll_mesh.color = [0.0, 0.8, 0.8, 0.8]
+
+ coll_mesh.name = "nvblox_voxel_world"
+ world.add_obstacle(coll_mesh)
+ # run planner
if write_usd:
# CuRobo
+ from curobo.util.usd_helper import UsdHelper
q_traj = result.get_interpolated_plan()
UsdHelper.write_trajectory_animation_with_robot_usd(
@@ -486,7 +513,8 @@ def benchmark_mb(
visualize_robot_spheres=True,
# flatten_usd=True,
)
- exit()
+ # write_usd = False
+ # exit()
if write_plot:
problem_name = problem_name
plot_traj(
@@ -540,6 +568,9 @@ def benchmark_mb(
}
problem["solution_debug"] = debug
if save_log and not result.success.item():
+ # CuRobo
+ from curobo.util.usd_helper import UsdHelper
+
UsdHelper.write_motion_gen_log(
result,
robot_cfg,
@@ -553,7 +584,7 @@ def benchmark_mb(
grid_space=2,
# flatten_usd=True,
)
- # exit()
+ exit()
g_m = CuroboGroupMetrics.from_list(m_list)
print(
key,
@@ -566,7 +597,7 @@ def benchmark_mb(
g_m.orientation_error.percent_98,
g_m.cspace_path_length.percent_98,
g_m.motion_time.percent_98,
- g_m.perception_success,
+ g_m.perception_interpolated_success,
# g_m.orientation_error.median,
)
print(g_m.attempts)
@@ -600,6 +631,7 @@ def benchmark_mb(
print("PT:", g_m.time)
print("ST: ", g_m.solve_time)
print("accuracy: ", g_m.position_error, g_m.orientation_error)
+ print("Jerk: ", g_m.jerk)
if __name__ == "__main__":
diff --git a/benchmark/curobo_profile.py b/benchmark/curobo_profile.py
index c155718..77cbcb2 100644
--- a/benchmark/curobo_profile.py
+++ b/benchmark/curobo_profile.py
@@ -73,8 +73,8 @@ def load_curobo(
position_threshold=0.005,
rotation_threshold=0.05,
num_ik_seeds=30,
- num_trajopt_seeds=10,
- interpolation_dt=0.02,
+ num_trajopt_seeds=12,
+ interpolation_dt=0.025,
# grad_trajopt_iters=200,
store_ik_debug=enable_log,
store_trajopt_debug=enable_log,
@@ -91,7 +91,7 @@ def benchmark_mb(args):
spheres = load_yaml(join_path(get_robot_configs_path(), spheres))["collision_spheres"]
plan_config = MotionGenPlanConfig(
- max_attempts=2,
+ max_attempts=1,
enable_graph_attempt=3,
enable_finetune_trajopt=True,
partial_ik_opt=False,
@@ -130,11 +130,18 @@ def benchmark_mb(args):
world = WorldConfig.from_dict(problem["obstacles"]).get_obb_world()
mg.update_world(world)
+ start_state = JointState.from_position(mg.tensor_args.to_device([q_start]))
+
+ result = mg.plan_single(
+ start_state,
+ Pose.from_list(pose),
+ plan_config,
+ )
+ print(result.total_time, result.solve_time)
# continue
# load obstacles
# run planner
- start_state = JointState.from_position(mg.tensor_args.to_device([q_start]))
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
result = mg.plan_single(
start_state,
diff --git a/benchmark/generate_nvblox_images.py b/benchmark/generate_nvblox_images.py
index 13b0f83..48681c0 100644
--- a/benchmark/generate_nvblox_images.py
+++ b/benchmark/generate_nvblox_images.py
@@ -43,7 +43,7 @@ def generate_images():
for key, v in tqdm(problems.items()):
scene_problems = problems[key]
- i = 0
+ i = -1
for problem in tqdm(scene_problems, leave=False):
i += 1
diff --git a/benchmark/metrics.py b/benchmark/metrics.py
index 751a4b5..b4cd4c3 100644
--- a/benchmark/metrics.py
+++ b/benchmark/metrics.py
@@ -29,6 +29,7 @@ class CuroboMetrics(TrajectoryMetrics):
cspace_path_length: float = 0.0
perception_success: bool = False
perception_interpolated_success: bool = False
+ jerk: float = np.inf
@dataclass
@@ -37,6 +38,7 @@ class CuroboGroupMetrics(TrajectoryGroupMetrics):
cspace_path_length: Optional[Statistic] = None
perception_success: float = 0.0
perception_interpolated_success: float = 0.0
+ jerk: float = np.inf
@classmethod
def from_list(cls, group: List[CuroboMetrics]):
@@ -49,5 +51,6 @@ class CuroboGroupMetrics(TrajectoryGroupMetrics):
data.perception_interpolated_success = percent_true(
[m.perception_interpolated_success for m in group]
)
+ data.jerk = Statistic.from_list([m.jerk for m in successes])
return data
diff --git a/docker/aarch64.dockerfile b/docker/aarch64.dockerfile
index b7a2130..e1ee188 100644
--- a/docker/aarch64.dockerfile
+++ b/docker/aarch64.dockerfile
@@ -129,24 +129,15 @@ RUN pip3 install trimesh \
empy
# Add cache date to avoid using cached layers older than this
-ARG CACHE_DATE=2023-12-15
-
-# warp from https://github.com/NVIDIA/warp needs to be compiled locally and then
-# placed in curobo/docker/pkgs.
-# Run the following from your terminal:
-# cd curobo/docker && mkdir pkgs && cd pkgs && git clone https://github.com/NVIDIA/warp.git
-# cd warp && python build_libs.py
-#
-# copy pkgs directory:
-COPY pkgs /pkgs
+ARG CACHE_DATE=2024-02-20
# install warp:
#
-RUN cd /pkgs/warp && pip3 install .
+RUN pip3 install warp-lang
# install curobo:
-RUN cd /pkgs && git clone https://github.com/NVlabs/curobo.git
+RUN mkdir /pkgs && git clone https://github.com/NVlabs/curobo.git
ENV TORCH_CUDA_ARCH_LIST "7.0+PTX"
@@ -157,6 +148,7 @@ WORKDIR /pkgs/curobo
# Optionally install nvblox:
ENV PYOPENGL_PLATFORM=egl
+
RUN apt-get update && \
apt-get install -y libgoogle-glog-dev libgtest-dev curl libsqlite3-dev libbenchmark-dev && \
cd /usr/src/googletest && cmake . && cmake --build . --target install && \
@@ -172,6 +164,7 @@ RUN cd /pkgs && git clone https://github.com/nvlabs/nvblox_torch.git && \
cd nvblox_torch && \
sh install.sh $(python3 -c 'import torch.utils; print(torch.utils.cmake_prefix_path)') && \
python3 -m pip install -e .
+
RUN python -m pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git"
diff --git a/docker/user.dockerfile b/docker/user.dockerfile
index ad17095..50bc6dc 100644
--- a/docker/user.dockerfile
+++ b/docker/user.dockerfile
@@ -15,6 +15,7 @@ FROM curobo_docker:${IMAGE_TAG}
# Set variables
ARG USERNAME
ARG USER_ID
+ARG CACHE_DATE=2024-02-20
# Set environment variables
diff --git a/docker/x86.dockerfile b/docker/x86.dockerfile
index 9156293..4725bf3 100644
--- a/docker/x86.dockerfile
+++ b/docker/x86.dockerfile
@@ -78,7 +78,7 @@ ENV TORCH_CUDA_ARCH_LIST "7.0+PTX"
ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
# Add cache date to avoid using cached layers older than this
-ARG CACHE_DATE=2023-12-15
+ARG CACHE_DATE=2024-02-20
RUN pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git"
diff --git a/examples/isaac_sim/constrained_reacher.py b/examples/isaac_sim/constrained_reacher.py
new file mode 100644
index 0000000..83f6041
--- /dev/null
+++ b/examples/isaac_sim/constrained_reacher.py
@@ -0,0 +1,355 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Third Party
+import torch
+
+a = torch.zeros(4, device="cuda:0")
+# Third Party
+from omni.isaac.kit import SimulationApp
+
+simulation_app = SimulationApp(
+ {
+ "headless": False,
+ "width": "1920",
+ "height": "1080",
+ }
+)
+
+# Third Party
+import numpy as np
+import torch
+from helper import add_extensions, add_robot_to_scene
+
+# CuRobo
+from curobo.geom.sdf.world import CollisionCheckerType
+from curobo.geom.types import Cuboid, WorldConfig
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import JointState, RobotConfig
+from curobo.types.state import JointState
+from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
+from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig
+from curobo.wrap.reacher.motion_gen import (
+ MotionGen,
+ MotionGenConfig,
+ MotionGenPlanConfig,
+ PoseCostMetric,
+)
+
+simulation_app.update()
+# Standard Library
+import argparse
+
+# Third Party
+import carb
+from omni.isaac.core import World
+from omni.isaac.core.materials import OmniGlass, OmniPBR
+from omni.isaac.core.objects import cuboid, sphere
+from omni.isaac.core.utils.types import ArticulationAction
+
+# CuRobo
+from curobo.util.usd_helper import UsdHelper
+
+parser = argparse.ArgumentParser()
+
+
+parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load")
+
+args = parser.parse_args()
+
+
+if __name__ == "__main__":
+ my_world = World(stage_units_in_meters=1.0)
+ stage = my_world.stage
+ n_obstacle_cuboids = 10
+ n_obstacle_mesh = 10
+
+ stage = my_world.stage
+ my_world.scene.add_default_ground_plane()
+
+ xform = stage.DefinePrim("/World", "Xform")
+ stage.SetDefaultPrim(xform)
+ target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0]))
+ target_material_2 = OmniPBR("/World/looks/t2", color=np.array([0, 1, 0]))
+ target_material_plane = OmniGlass(
+ "/World/looks/t3", color=np.array([0, 1, 0]), ior=1.25, depth=0.001, thin_walled=False
+ )
+ target_material_line = OmniGlass(
+ "/World/looks/t4", color=np.array([0, 1, 0]), ior=1.25, depth=0.001, thin_walled=True
+ )
+
+ # target_orient = [0,0,0.707,0.707]
+ target_orient = [0.5, -0.5, 0.5, 0.5]
+
+ target = cuboid.VisualCuboid(
+ "/World/target_1",
+ position=np.array([0.55, -0.3, 0.5]),
+ orientation=np.array(target_orient),
+ size=0.04,
+ visual_material=target_material,
+ )
+
+ # Make a target to follow
+ target_2 = cuboid.VisualCuboid(
+ "/World/target_2",
+ position=np.array([0.55, 0.4, 0.5]),
+ orientation=np.array(target_orient),
+ size=0.04,
+ visual_material=target_material_2,
+ )
+
+ x_plane = cuboid.VisualCuboid(
+ "/World/constraint_plane",
+ position=np.array([0.55, 0.05, 0.5]),
+ orientation=np.array(target_orient),
+ scale=[1.1, 0.001, 1.0],
+ visual_material=target_material_plane,
+ )
+ xz_line = cuboid.VisualCuboid(
+ "/World/constraint_line",
+ position=np.array([0.55, 0.05, 0.5]),
+ orientation=np.array(target_orient),
+ scale=[0.04, 0.04, 0.65],
+ visual_material=target_material_line,
+ )
+
+ collision_checker_type = CollisionCheckerType.BLOX
+
+ tensor_args = TensorDeviceType()
+
+ robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"]
+
+ j_names = robot_cfg["kinematics"]["cspace"]["joint_names"]
+ default_config = robot_cfg["kinematics"]["cspace"]["retract_config"]
+
+ robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world, "/World/world_robot/")
+
+ world_cfg_table = WorldConfig.from_dict(
+ load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
+ )
+
+ world_cfg_table.cuboid[0].pose[2] -= 0.01
+ world_cfg1 = WorldConfig.from_dict(
+ load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
+ ).get_mesh_world()
+ world_cfg1.mesh[0].name += "_mesh"
+ world_cfg1.mesh[0].pose[2] = -10.5
+
+ world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh)
+
+ usd_help = UsdHelper()
+
+ usd_help.load_stage(my_world.stage)
+ usd_help.add_world_to_stage(world_cfg_table.get_mesh_world(), base_frame="/World")
+ motion_gen_config = MotionGenConfig.load_from_robot_config(
+ robot_cfg,
+ world_cfg,
+ tensor_args,
+ collision_checker_type=CollisionCheckerType.MESH,
+ collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh},
+ velocity_scale=0.75,
+ interpolation_dt=0.02,
+ ee_link_name="right_gripper",
+ )
+ motion_gen = MotionGen(motion_gen_config)
+ print("warming up..")
+ motion_gen.warmup(warmup_js_trajopt=False)
+
+ world_model = motion_gen.world_collision
+
+ i = 0
+ tensor_args = TensorDeviceType()
+ target_list = [target, target_2]
+ target_material_list = [target_material, target_material_2]
+ for material in target_material_list:
+ material.set_color(np.array([0.1, 0.1, 0.1]))
+ target_material_plane.set_color(np.array([1, 1, 1]))
+ target_material_line.set_color(np.array([1, 1, 1]))
+
+ target_idx = 0
+ cmd_idx = 0
+ cmd_plan = None
+ articulation_controller = robot.get_articulation_controller()
+ plan_config = MotionGenPlanConfig(
+ enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True
+ )
+
+ plan_idx = 0
+ cmd_step_idx = 0
+ pose_cost_metric = None
+ while simulation_app.is_running():
+ my_world.step(render=True)
+
+ if not my_world.is_playing():
+ if i % 100 == 0:
+ print("**** Click Play to start simulation *****")
+ i += 1
+ # if step_index == 0:
+ # my_world.play()
+ continue
+ step_index = my_world.current_time_step_index
+
+ if step_index <= 2:
+ my_world.reset()
+ idx_list = [robot.get_dof_index(x) for x in j_names]
+ robot.set_joint_positions(default_config, idx_list)
+
+ robot._articulation_view.set_max_efforts(
+ values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list
+ )
+
+ if False and step_index % 50 == 0.0: # No obstacle update
+ obstacles = usd_help.get_obstacles_from_stage(
+ # only_paths=[obstacles_path],
+ reference_prim_path=robot_prim_path,
+ ignore_substring=[
+ robot_prim_path,
+ "/World/target",
+ "/World/defaultGroundPlane",
+ "/curobo",
+ ],
+ ).get_collision_check_world()
+ # print(len(obstacles.objects))
+
+ motion_gen.update_world(obstacles)
+ # print("Updated World")
+ carb.log_info("Synced CuRobo world from stage.")
+
+ linear_color = np.ravel([249, 87, 56]) / 255.0
+ orient_color = np.ravel([103, 148, 54]) / 255.0
+
+ disable_color = np.ravel([255, 255, 255]) / 255.0
+
+ if cmd_plan is None and step_index % 10 == 0:
+
+ if plan_idx == 4:
+ print("Constrained: Holding tool linear-y")
+ pose_cost_metric = PoseCostMetric(
+ hold_partial_pose=True,
+ hold_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 0, 1, 0]),
+ )
+ target_material_plane.set_color(linear_color)
+ if plan_idx == 8:
+ print("Constrained: Holding tool Orientation and linear-y")
+ pose_cost_metric = PoseCostMetric(
+ hold_partial_pose=True,
+ hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 0, 1, 0]),
+ )
+ target_material_plane.set_color(orient_color)
+ if plan_idx == 12:
+ print("Constrained: Holding tool linear-y, linear-x")
+ pose_cost_metric = PoseCostMetric(
+ hold_partial_pose=True,
+ hold_vec_weight=motion_gen.tensor_args.to_device([0, 0, 0, 1, 1, 0]),
+ )
+ target_material_line.set_color(linear_color)
+ target_material_plane.set_color(disable_color)
+
+ if plan_idx == 16:
+ print("Constrained: Holding tool Orientation and linear-y, linear-x")
+ pose_cost_metric = PoseCostMetric(
+ hold_partial_pose=True,
+ hold_vec_weight=motion_gen.tensor_args.to_device([1, 1, 1, 1, 1, 0]),
+ )
+ target_material_line.set_color(orient_color)
+ target_material_plane.set_color(disable_color)
+
+ if plan_idx > 20:
+ plan_idx = 0
+
+ if plan_idx == 0:
+ print("Constrained: Reset")
+ target_material_line.set_color(disable_color)
+ target_material_plane.set_color(disable_color)
+
+ pose_cost_metric = None
+
+ plan_config.pose_cost_metric = pose_cost_metric
+
+ # motion generation:
+ for ks in range(len(target_material_list)):
+ if ks == target_idx:
+ target_material_list[ks].set_color(np.ravel([0, 1.0, 0]))
+ else:
+ target_material_list[ks].set_color(np.ravel([0.1, 0.1, 0.1]))
+
+ sim_js = robot.get_joints_state()
+ sim_js_names = robot.dof_names
+ cu_js = JointState(
+ position=tensor_args.to_device(sim_js.positions),
+ velocity=tensor_args.to_device(sim_js.velocities) * 0.0,
+ acceleration=tensor_args.to_device(sim_js.velocities) * 0.0,
+ jerk=tensor_args.to_device(sim_js.velocities) * 0.0,
+ joint_names=sim_js_names,
+ )
+ cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names)
+
+ cube_position, cube_orientation = target_list[target_idx].get_world_pose()
+
+ # Set EE teleop goals, use cube for simple non-vr init:
+ ee_translation_goal = cube_position
+ ee_orientation_teleop_goal = cube_orientation
+
+ # compute curobo solution:
+ ik_goal = Pose(
+ position=tensor_args.to_device(ee_translation_goal),
+ quaternion=tensor_args.to_device(ee_orientation_teleop_goal),
+ )
+ result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config)
+ # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1))
+
+ succ = result.success.item() # ik_result.success.item()
+ plan_idx += 1
+ if succ:
+ cmd_plan = result.get_interpolated_plan()
+ cmd_plan = motion_gen.get_full_js(cmd_plan)
+ # get only joint names that are in both:
+ idx_list = []
+ common_js_names = []
+ for x in sim_js_names:
+ if x in cmd_plan.joint_names:
+ idx_list.append(robot.get_dof_index(x))
+ common_js_names.append(x)
+ # idx_list = [robot.get_dof_index(x) for x in sim_js_names]
+
+ cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names)
+
+ cmd_idx = 0
+ target_idx += 1
+ if target_idx >= len(target_list):
+ target_idx = 0
+
+ else:
+ carb.log_warn("Plan did not converge to a solution. No action is being taken.")
+ if cmd_plan is not None:
+ cmd_state = cmd_plan[cmd_idx]
+
+ # get full dof state
+ art_action = ArticulationAction(
+ cmd_state.position.cpu().numpy(),
+ cmd_state.velocity.cpu().numpy(),
+ joint_indices=idx_list,
+ )
+ # set desired joint angles obtained from IK:
+ articulation_controller.apply_action(art_action)
+ cmd_step_idx += 1
+ if cmd_step_idx == 2:
+ cmd_idx += 1
+ cmd_step_idx = 0
+ # for _ in range(2):
+ # my_world.step(render=False)
+ if cmd_idx >= len(cmd_plan.position):
+ cmd_idx = 0
+ cmd_plan = None
+ print("finished program")
+
+ simulation_app.close()
diff --git a/examples/isaac_sim/motion_gen_reacher.py b/examples/isaac_sim/motion_gen_reacher.py
index 95d4749..6461fd7 100644
--- a/examples/isaac_sim/motion_gen_reacher.py
+++ b/examples/isaac_sim/motion_gen_reacher.py
@@ -51,6 +51,32 @@ parser.add_argument(
help="When True, runs in reactive mode",
default=False,
)
+
+parser.add_argument(
+ "--constrain_grasp_approach",
+ action="store_true",
+ help="When True, approaches grasp with fixed orientation and motion only along z axis.",
+ default=False,
+)
+
+parser.add_argument(
+ "--reach_partial_pose",
+ nargs=6,
+ metavar=("qx", "qy", "qz", "x", "y", "z"),
+ help="Reach partial pose",
+ type=float,
+ default=None,
+)
+parser.add_argument(
+ "--hold_partial_pose",
+ nargs=6,
+ metavar=("qx", "qy", "qz", "x", "y", "z"),
+ help="Hold partial pose while moving to goal",
+ type=float,
+ default=None,
+)
+
+
args = parser.parse_args()
############################################################
@@ -97,7 +123,12 @@ from curobo.util_file import (
join_path,
load_yaml,
)
-from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig
+from curobo.wrap.reacher.motion_gen import (
+ MotionGen,
+ MotionGenConfig,
+ MotionGenPlanConfig,
+ PoseCostMetric,
+)
############################################################
@@ -107,7 +138,7 @@ from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGen
def main():
# create a curobo motion gen instance:
-
+ num_targets = 0
# assuming obstacles are in objects_path:
my_world = World(stage_units_in_meters=1.0)
stage = my_world.stage
@@ -152,12 +183,12 @@ def main():
robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world)
- articulation_controller = robot.get_articulation_controller()
+ articulation_controller = None
world_cfg_table = WorldConfig.from_dict(
load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
)
- world_cfg_table.cuboid[0].pose[2] -= 0.04
+ world_cfg_table.cuboid[0].pose[2] -= 0.02
world_cfg1 = WorldConfig.from_dict(
load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
).get_mesh_world()
@@ -171,12 +202,14 @@ def main():
trajopt_tsteps = 32
trim_steps = None
max_attempts = 4
+ interpolation_dt = 0.05
if args.reactive:
- trajopt_tsteps = 36
+ trajopt_tsteps = 40
trajopt_dt = 0.05
optimize_dt = False
- max_attemtps = 1
+ max_attempts = 1
trim_steps = [1, None]
+ interpolation_dt = trajopt_dt
motion_gen_config = MotionGenConfig.load_from_robot_config(
robot_cfg,
world_cfg,
@@ -184,12 +217,15 @@ def main():
collision_checker_type=CollisionCheckerType.MESH,
num_trajopt_seeds=12,
num_graph_seeds=12,
- interpolation_dt=0.05,
+ interpolation_dt=interpolation_dt,
collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh},
optimize_dt=optimize_dt,
trajopt_dt=trajopt_dt,
trajopt_tsteps=trajopt_tsteps,
trim_steps=trim_steps,
+ # velocity_scale=0.1,
+ # self_collision_check=False,
+ # self_collision_opt=False,
)
motion_gen = MotionGen(motion_gen_config)
print("warming up...")
@@ -216,6 +252,9 @@ def main():
i = 0
spheres = None
past_cmd = None
+ target_orientation = None
+ past_orientation = None
+ pose_metric = None
while simulation_app.is_running():
my_world.step(render=True)
if not my_world.is_playing():
@@ -228,6 +267,9 @@ def main():
step_index = my_world.current_time_step_index
# print(step_index)
+ if articulation_controller is None:
+ # robot.initialize()
+ articulation_controller = robot.get_articulation_controller()
if step_index < 2:
my_world.reset()
robot._articulation_view.initialize()
@@ -265,6 +307,10 @@ def main():
past_pose = cube_position
if target_pose is None:
target_pose = cube_position
+ if target_orientation is None:
+ target_orientation = cube_orientation
+ if past_orientation is None:
+ past_orientation = cube_orientation
sim_js = robot.get_joints_state()
sim_js_names = robot.dof_names
@@ -313,8 +359,12 @@ def main():
if (np.max(np.abs(sim_js.velocities)) < 0.2) or args.reactive:
robot_static = True
if (
- np.linalg.norm(cube_position - target_pose) > 1e-3
+ (
+ np.linalg.norm(cube_position - target_pose) > 1e-3
+ or np.linalg.norm(cube_orientation - target_orientation) > 1e-3
+ )
and np.linalg.norm(past_pose - cube_position) == 0.0
+ and np.linalg.norm(past_orientation - cube_orientation) == 0.0
and robot_static
):
# Set EE teleop goals, use cube for simple non-vr init:
@@ -326,12 +376,24 @@ def main():
position=tensor_args.to_device(ee_translation_goal),
quaternion=tensor_args.to_device(ee_orientation_teleop_goal),
)
-
+ plan_config.pose_cost_metric = pose_metric
result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config)
# ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1))
succ = result.success.item() # ik_result.success.item()
+ if num_targets == 1:
+ if args.constrain_grasp_approach:
+ pose_metric = PoseCostMetric.create_grasp_approach_metric()
+ if args.reach_partial_pose is not None:
+ reach_vec = motion_gen.tensor_args.to_device(args.reach_partial_pose)
+ pose_metric = PoseCostMetric(
+ reach_partial_pose=True, reach_vec_weight=reach_vec
+ )
+ if args.hold_partial_pose is not None:
+ hold_vec = motion_gen.tensor_args.to_device(args.hold_partial_pose)
+ pose_metric = PoseCostMetric(hold_partial_pose=True, hold_vec_weight=hold_vec)
if succ:
+ num_targets += 1
cmd_plan = result.get_interpolated_plan()
cmd_plan = motion_gen.get_full_js(cmd_plan)
# get only joint names that are in both:
@@ -350,7 +412,9 @@ def main():
else:
carb.log_warn("Plan did not converge to a solution. No action is being taken.")
target_pose = cube_position
+ target_orientation = cube_orientation
past_pose = cube_position
+ past_orientation = cube_orientation
if cmd_plan is not None:
cmd_state = cmd_plan[cmd_idx]
past_cmd = cmd_state.clone()
diff --git a/examples/isaac_sim/simple_stacking.py b/examples/isaac_sim/simple_stacking.py
index a72bf83..5d3422f 100644
--- a/examples/isaac_sim/simple_stacking.py
+++ b/examples/isaac_sim/simple_stacking.py
@@ -34,6 +34,13 @@ parser.add_argument(
default=None,
help="To run headless, use one of [native, websocket], webrtc might not work.",
)
+
+parser.add_argument(
+ "--constrain_grasp_approach",
+ action="store_true",
+ help="When True, approaches grasp with fixed orientation and motion only along z axis.",
+ default=False,
+)
args = parser.parse_args()
# Third Party
@@ -78,6 +85,7 @@ from curobo.wrap.reacher.motion_gen import (
MotionGenConfig,
MotionGenPlanConfig,
MotionGenResult,
+ PoseCostMetric,
)
@@ -87,6 +95,7 @@ class CuroboController(BaseController):
my_world: World,
my_task: BaseStacking,
name: str = "curobo_controller",
+ constrain_grasp_approach: bool = False,
) -> None:
BaseController.__init__(self, name=name)
self._save_log = False
@@ -145,13 +154,16 @@ class CuroboController(BaseController):
collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh},
store_ik_debug=self._save_log,
store_trajopt_debug=self._save_log,
- state_finite_difference_mode="CENTRAL",
- acceleration_scale=0.5,
- fixed_iters_trajopt=True,
+ velocity_scale=0.75,
)
self.motion_gen = MotionGen(motion_gen_config)
print("warming up...")
- self.motion_gen.warmup()
+ self.motion_gen.warmup(parallel_finetune=True)
+ pose_metric = None
+ if constrain_grasp_approach:
+ pose_metric = PoseCostMetric.create_grasp_approach_metric(
+ offset_position=0.1, tstep_fraction=0.6
+ )
self.plan_config = MotionGenPlanConfig(
enable_graph=False,
@@ -159,6 +171,8 @@ class CuroboController(BaseController):
enable_graph_attempt=None,
enable_finetune_trajopt=True,
partial_ik_opt=False,
+ parallel_finetune=True,
+ pose_cost_metric=pose_metric,
)
self.usd_help.load_stage(self.my_world.stage)
self.cmd_plan = None
@@ -185,7 +199,7 @@ class CuroboController(BaseController):
cu_js,
[cube_name],
sphere_fit_type=SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
- world_objects_pose_offset=Pose.from_list([0, 0, 0.005, 1, 0, 0, 0], self.tensor_args),
+ world_objects_pose_offset=Pose.from_list([0, 0, 0.01, 1, 0, 0, 0], self.tensor_args),
)
def detach_obj(self) -> None:
@@ -259,7 +273,7 @@ class CuroboController(BaseController):
# get full dof state
art_action = ArticulationAction(
cmd_state.position.cpu().numpy(),
- # cmd_state.velocity.cpu().numpy(),
+ cmd_state.velocity.cpu().numpy() * 0.0,
joint_indices=self.idx_list,
)
if self.cmd_idx >= len(self.cmd_plan.position):
@@ -417,7 +431,9 @@ my_world.add_task(my_task)
my_world.reset()
robot_name = my_task.get_params()["robot_name"]["value"]
my_franka = my_world.scene.get_object(robot_name)
-my_controller = CuroboController(my_world=my_world, my_task=my_task)
+my_controller = CuroboController(
+ my_world=my_world, my_task=my_task, constrain_grasp_approach=args.constrain_grasp_approach
+)
articulation_controller = my_franka.get_articulation_controller()
set_camera_view(eye=[2, 0, 1], target=[0.00, 0.00, 0.00], camera_prim_path="/OmniverseKit_Persp")
wait_steps = 8
@@ -439,17 +455,17 @@ print(articulation_controller.get_gains())
print(articulation_controller.get_max_efforts())
robot = my_franka
print("**********************")
-robot.enable_gravity()
-articulation_controller.set_gains(
- kps=np.array(
- [100000000, 6000000.0, 10000000, 600000.0, 25000.0, 15000.0, 50000.0, 6000.0, 6000.0]
+if True:
+ robot.enable_gravity()
+ articulation_controller.set_gains(
+ kps=np.array(
+ [100000000, 6000000.0, 10000000, 600000.0, 25000.0, 15000.0, 50000.0, 6000.0, 6000.0]
+ )
)
-)
-
-articulation_controller.set_max_efforts(
- values=np.array([100000, 52.199997, 100000, 52.199997, 7.2, 7.2, 7.2, 50.0, 50])
-)
+ articulation_controller.set_max_efforts(
+ values=np.array([100000, 52.199997, 100000, 52.199997, 7.2, 7.2, 7.2, 50.0, 50])
+ )
print("Updated gains:")
print(articulation_controller.get_gains())
diff --git a/examples/kinematics_example.py b/examples/kinematics_example.py
index a32827a..9b83829 100644
--- a/examples/kinematics_example.py
+++ b/examples/kinematics_example.py
@@ -15,7 +15,7 @@
import torch
# CuRobo
-from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig
+from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel
from curobo.types.base import TensorDeviceType
from curobo.types.robot import RobotConfig
from curobo.util.logger import setup_curobo_logger
diff --git a/examples/motion_gen_example.py b/examples/motion_gen_example.py
index 789f33b..820d8c1 100644
--- a/examples/motion_gen_example.py
+++ b/examples/motion_gen_example.py
@@ -42,8 +42,9 @@ def plot_traj(trajectory, dt):
axs[3].plot(timesteps, qddd[:, i], label=str(i))
plt.legend()
- # plt.savefig("test.png")
- plt.show()
+ plt.savefig("test.png")
+ plt.close()
+ # plt.show()
def plot_iters_traj(trajectory, d_id=1, dof=7, seed=0):
@@ -140,37 +141,54 @@ def demo_motion_gen(js=False):
world_file,
tensor_args,
interpolation_dt=0.01,
+ # trajopt_dt=0.15,
+ # velocity_scale=0.1,
+ use_cuda_graph=True,
+ # finetune_dt_scale=2.5,
+ interpolation_steps=10000,
)
motion_gen = MotionGen(motion_gen_config)
+ motion_gen.warmup(parallel_finetune=True)
- motion_gen.warmup(enable_graph=True, warmup_js_trajopt=js)
- robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
- robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)
+ # motion_gen.warmup(enable_graph=True, warmup_js_trajopt=js, parallel_finetune=True)
+ # robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+ # robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)
retract_cfg = motion_gen.get_retract_config()
state = motion_gen.rollout_fn.compute_kinematics(
JointState.from_position(retract_cfg.view(1, -1))
)
retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze())
- start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.1)
+ start_state = JointState.from_position(retract_cfg.view(1, -1))
goal_state = start_state.clone()
- goal_state.position[..., 3] -= 0.1
+
+ start_state.position[0, 0] += 0.25
+ # goal_state.position[0,0] += 0.5
if js:
result = motion_gen.plan_single_js(
start_state,
goal_state,
- MotionGenPlanConfig(
- max_attempts=1, enable_graph=False, enable_opt=True, enable_finetune_trajopt=True
- ),
+ MotionGenPlanConfig(max_attempts=1, parallel_finetune=True),
)
else:
result = motion_gen.plan_single(
- start_state, retract_pose, MotionGenPlanConfig(max_attempts=1)
+ start_state,
+ retract_pose,
+ MotionGenPlanConfig(
+ max_attempts=1, parallel_finetune=True, enable_finetune_trajopt=True
+ ),
)
- traj = result.get_interpolated_plan()
- print("Trajectory Generated: ", result.success, result.solve_time, result.status)
+ print(
+ "Trajectory Generated: ",
+ result.success,
+ result.solve_time,
+ result.status,
+ result.optimized_dt,
+ )
if PLOT and result.success.item():
+ traj = result.get_interpolated_plan()
+
plot_traj(traj, result.interpolation_dt)
# plt.save("test.png")
# plt.close()
@@ -261,6 +279,45 @@ def demo_motion_gen_batch():
plot_traj(traj[1, : result.path_buffer_last_tstep[1], :].cpu().numpy())
+def demo_motion_gen_goalset():
+ tensor_args = TensorDeviceType()
+ world_file = "collision_cubby.yml"
+ robot_file = "franka.yml"
+ motion_gen_config = MotionGenConfig.load_from_robot_config(
+ robot_file,
+ world_file,
+ tensor_args,
+ collision_checker_type=CollisionCheckerType.PRIMITIVE,
+ use_cuda_graph=True,
+ num_trajopt_seeds=12,
+ num_graph_seeds=1,
+ num_ik_seeds=30,
+ )
+ motion_gen = MotionGen(motion_gen_config)
+ robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+ robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)
+ retract_cfg = motion_gen.get_retract_config()
+ state = motion_gen.rollout_fn.compute_kinematics(
+ JointState.from_position(retract_cfg.view(1, -1))
+ )
+
+ start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.6)
+
+ state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1)))
+
+ goal_pose = Pose(
+ state.ee_pos_seq.repeat(2, 1).view(1, -1, 3),
+ quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4),
+ )
+ goal_pose.position[0, 0, 0] -= 0.1
+
+ start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3)
+
+ m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10)
+
+ result = motion_gen.plan_goalset(start_state, goal_pose, m_config)
+
+
def demo_motion_gen_api():
tensor_args = TensorDeviceType(device=torch.device("cuda:0"))
interpolation_dt = 0.02
@@ -373,8 +430,9 @@ def demo_motion_gen_batch_env(n_envs: int = 10):
if __name__ == "__main__":
setup_curobo_logger("error")
- # demo_motion_gen(js=True)
- demo_motion_gen_batch()
+ demo_motion_gen(js=False)
+ # demo_motion_gen_batch()
+ # demo_motion_gen_goalset()
# n = [2, 10]
# for n_envs in n:
# demo_motion_gen_batch_env(n_envs=n_envs)
diff --git a/examples/robot_image_segmentation_example.py b/examples/robot_image_segmentation_example.py
new file mode 100644
index 0000000..d98fcdb
--- /dev/null
+++ b/examples/robot_image_segmentation_example.py
@@ -0,0 +1,238 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+""" This example shows how to use cuRobo's kinematics to generate a mask. """
+
+
+# Standard Library
+import time
+
+# Third Party
+import imageio
+import numpy as np
+import torch
+import torch.autograd.profiler as profiler
+from nvblox_torch.datasets.mesh_dataset import MeshDataset
+from torch.profiler import ProfilerActivity, profile, record_function
+
+# CuRobo
+from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel
+from curobo.geom.types import PointCloud, WorldConfig
+from curobo.types.base import TensorDeviceType
+from curobo.types.camera import CameraObservation
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.types.state import JointState
+from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
+from curobo.wrap.model.robot_segmenter import RobotSegmenter
+from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig
+
+torch.manual_seed(30)
+
+torch.backends.cudnn.benchmark = True
+
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+
+def create_render_dataset(robot_file, save_debug_data: bool = False):
+ # load robot:
+ robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file))
+ robot_dict["robot_cfg"]["kinematics"]["load_link_names_with_mesh"] = True
+ robot_dict["robot_cfg"]["kinematics"]["load_meshes"] = True
+
+ robot_cfg = RobotConfig.from_dict(robot_dict["robot_cfg"])
+
+ kin_model = CudaRobotModel(robot_cfg.kinematics)
+
+ q = kin_model.retract_config
+
+ meshes = kin_model.get_robot_as_mesh(q)
+
+ world = WorldConfig(mesh=meshes[:])
+ world_table = WorldConfig.from_dict(
+ load_yaml(join_path(get_world_configs_path(), "collision_test.yml"))
+ )
+ world_table.cuboid[0].dims = [0.5, 0.5, 0.1]
+ world.add_obstacle(world_table.objects[0])
+ world.add_obstacle(world_table.objects[1])
+ if save_debug_data:
+ world.save_world_as_mesh("scene.stl", process_color=False)
+ robot_mesh = (
+ WorldConfig.create_merged_mesh_world(world, process_color=False).mesh[0].get_trimesh_mesh()
+ )
+
+ mesh_dataset = MeshDataset(
+ None,
+ n_frames=20,
+ image_size=480,
+ save_data_dir=None,
+ trimesh_mesh=robot_mesh,
+ )
+ q_js = JointState(position=q, joint_names=kin_model.joint_names)
+
+ return mesh_dataset, q_js
+
+
+def mask_image(robot_file="ur5e.yml"):
+ save_debug_data = False
+ # create robot segmenter:
+ tensor_args = TensorDeviceType()
+
+ curobo_segmenter = RobotSegmenter.from_robot_file(
+ robot_file, collision_sphere_buffer=0.01, distance_threshold=0.05, use_cuda_graph=True
+ )
+
+ mesh_dataset, q_js = create_render_dataset(robot_file, save_debug_data)
+
+ if save_debug_data:
+ visualize_scale = 10.0
+ data = mesh_dataset[0]
+ cam_obs = CameraObservation(
+ depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000,
+ intrinsics=data["intrinsics"],
+ pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)),
+ )
+ # save depth image
+ imageio.imwrite(
+ "camera_depth.png",
+ (cam_obs.depth_image * visualize_scale)
+ .squeeze()
+ .detach()
+ .cpu()
+ .numpy()
+ .astype(np.uint16),
+ )
+
+ # save robot spheres in current joint configuration
+ robot_kinematics = curobo_segmenter._robot_world.kinematics
+ sph = robot_kinematics.get_robot_as_spheres(q_js.position)
+ WorldConfig(sphere=sph[0]).save_world_as_mesh("robot_spheres.stl")
+
+ # save world pointcloud in robot origin
+
+ pc = cam_obs.get_pointcloud()
+ pc_obs = PointCloud("world", pose=cam_obs.pose.to_list(), points=pc)
+ pc_obs.save_as_mesh("camera_pointcloud.stl", transform_with_pose=True)
+
+ # run segmentation:
+ depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js(
+ cam_obs,
+ q_js,
+ )
+ # save robot points as mesh
+
+ robot_mask = cam_obs.clone()
+ robot_mask.depth_image[~depth_mask] = 0.0
+ robot_mesh = PointCloud(
+ "world", pose=robot_mask.pose.to_list(), points=robot_mask.get_pointcloud()
+ )
+ robot_mesh.save_as_mesh("robot_segmented.stl", transform_with_pose=True)
+ # save depth image
+ imageio.imwrite(
+ "robot_depth.png",
+ (robot_mask.depth_image * visualize_scale)
+ .detach()
+ .squeeze()
+ .cpu()
+ .numpy()
+ .astype(np.uint16),
+ )
+
+ # save world points as mesh
+
+ world_mask = cam_obs.clone()
+ world_mask.depth_image[depth_mask] = 0.0
+ world_mesh = PointCloud(
+ "world", pose=world_mask.pose.to_list(), points=world_mask.get_pointcloud()
+ )
+ world_mesh.save_as_mesh("world_segmented.stl", transform_with_pose=True)
+
+ imageio.imwrite(
+ "world_depth.png",
+ (world_mask.depth_image * visualize_scale)
+ .detach()
+ .squeeze()
+ .cpu()
+ .numpy()
+ .astype(np.uint16),
+ )
+
+ dt_list = []
+
+ for i in range(len(mesh_dataset)):
+
+ data = mesh_dataset[i]
+ cam_obs = CameraObservation(
+ depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000,
+ intrinsics=data["intrinsics"],
+ pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)),
+ )
+ if not curobo_segmenter.ready:
+ curobo_segmenter.update_camera_projection(cam_obs)
+ st_time = time.time()
+
+ depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js(
+ cam_obs,
+ q_js,
+ )
+
+ torch.cuda.synchronize()
+ dt_list.append(time.time() - st_time)
+
+ print("Segmentation Time (ms), (hz)", np.mean(dt_list[5:]) * 1000.0, 1.0 / np.mean(dt_list[5:]))
+
+
+def profile_mask_image(robot_file="ur5e.yml"):
+ # create robot segmenter:
+ tensor_args = TensorDeviceType()
+
+ curobo_segmenter = RobotSegmenter.from_robot_file(
+ robot_file, collision_sphere_buffer=0.0, distance_threshold=0.05, use_cuda_graph=False
+ )
+
+ mesh_dataset, q_js = create_render_dataset(robot_file)
+
+ dt_list = []
+ data = mesh_dataset[0]
+ cam_obs = CameraObservation(
+ depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000,
+ intrinsics=data["intrinsics"],
+ pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)),
+ )
+ if not curobo_segmenter.ready:
+ curobo_segmenter.update_camera_projection(cam_obs)
+ depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js(cam_obs, q_js)
+
+ with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+
+ for i in range(len(mesh_dataset)):
+ with profiler.record_function("get_data"):
+
+ data = mesh_dataset[i]
+ cam_obs = CameraObservation(
+ depth_image=tensor_args.to_device(data["depth"]).unsqueeze(0) * 1000,
+ intrinsics=data["intrinsics"],
+ pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)),
+ )
+ st_time = time.time()
+ with profiler.record_function("segmentation"):
+
+ depth_mask, filtered_image = curobo_segmenter.get_robot_mask_from_active_js(
+ cam_obs, q_js
+ )
+
+ print("Exporting the trace..")
+ prof.export_chrome_trace("segmentation.json")
+
+
+if __name__ == "__main__":
+ mask_image()
+ # profile_mask_image()
diff --git a/setup.py b/setup.py
index bd6a1d7..c875560 100644
--- a/setup.py
+++ b/setup.py
@@ -87,4 +87,6 @@ ext_modules = [
setuptools.setup(
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
+ package_data={"": ["*.so"]},
+ include_package_data=True,
)
diff --git a/src/curobo/__init__.py b/src/curobo/__init__.py
index 58d5ff0..d56871c 100644
--- a/src/curobo/__init__.py
+++ b/src/curobo/__init__.py
@@ -9,7 +9,22 @@
# its affiliates is strictly prohibited.
#
-"""CuRobo package."""
+"""
+cuRobo package is split into several modules:
+
+- :mod:`curobo.opt` contains optimization solvers.
+- :mod:`curobo.cuda_robot_model` contains robot kinematics.
+- :mod:`curobo.curobolib` contains the cuda kernels and python bindings for them.
+- :mod:`curobo.geom` contains geometry processing, collision checking and frame transforms.
+- :mod:`curobo.graph` contains geometric planning with graph search methods.
+- :mod:`curobo.rollout` contains methods that map actions to costs. This class wraps instances of
+ :mod:`curobo.cuda_robot_model` and :mod:`geom` to compute costs given trajectory of actions.
+- :mod:`curobo.util` contains utility methods.
+- :mod:`curobo.wrap` adds the user-level api for task programming. Includes implementation of
+ collision-free reacher and batched robot world collision checking.
+- :mod:`curobo.types` contains custom dataclasses for common data types in robotics, including
+ :meth:`types.state.JointState`, :meth:`types.camera.CameraObservation`, :meth:`types.math.Pose`.
+"""
# NOTE (roflaherty): This is inspired by how matplotlib does creates its version value.
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
index 5fc618d..1821c7a 100644
--- a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
@@ -172,7 +172,7 @@
-
-
-
-
-
diff --git a/src/curobo/content/configs/robot/franka.yml b/src/curobo/content/configs/robot/franka.yml
index 07ae276..bc3f013 100644
--- a/src/curobo/content/configs/robot/franka.yml
+++ b/src/curobo/content/configs/robot/franka.yml
@@ -13,7 +13,7 @@ robot_cfg:
kinematics:
use_usd_kinematics: False
isaac_usd_path: "/Isaac/Robots/Franka/franka.usd"
- usd_path: "robot/non_shipping/franka/franka_panda_meters.usda"
+ usd_path: "robot/franka/franka_panda.usda"
usd_robot_root: "/panda"
usd_flip_joints: ["panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5",
"panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"]
@@ -32,7 +32,7 @@ robot_cfg:
usd_flip_joint_limits: ["panda_finger_joint2"]
urdf_path: "robot/franka_description/franka_panda.urdf"
asset_root_path: "robot/franka_description"
- base_link: "base_link"
+ base_link: "panda_link0"
ee_link: "panda_hand"
collision_link_names:
[
@@ -50,7 +50,7 @@ robot_cfg:
"attached_object",
]
collision_spheres: "spheres/franka_mesh.yml"
- collision_sphere_buffer: 0.0025
+ collision_sphere_buffer: 0.004 # 0.0025
extra_collision_spheres: {"attached_object": 4}
use_global_cumul: True
self_collision_ignore:
diff --git a/src/curobo/content/configs/robot/franka_mobile.yml b/src/curobo/content/configs/robot/franka_mobile.yml
index be13b24..62a4f52 100644
--- a/src/curobo/content/configs/robot/franka_mobile.yml
+++ b/src/curobo/content/configs/robot/franka_mobile.yml
@@ -109,7 +109,7 @@ robot_cfg:
"base_x", "base_y", "base_z",
"panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5",
"panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"]
- retract_config: [0,0,0,0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04]
+ retract_config: [0,0,0,0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04]
null_space_weight: [1,1,1,1,1,1,1,1,1,1,1,1]
cspace_distance_weight: [1,1,1,1,1,1,1,1,1,1,1,1]
max_acceleration: 15.0 #15.0
diff --git a/src/curobo/content/configs/robot/tm12.yml b/src/curobo/content/configs/robot/tm12.yml
index 01db09c..4d0cf78 100644
--- a/src/curobo/content/configs/robot/tm12.yml
+++ b/src/curobo/content/configs/robot/tm12.yml
@@ -12,7 +12,7 @@
robot_cfg:
kinematics:
use_usd_kinematics: False
- usd_path: "robot/non_shipping/techman/tm12.usd"
+ usd_path: "robot/techman/tm12.usd"
usd_robot_root: "/tm12"
usd_flip_joints:
{
diff --git a/src/curobo/content/configs/robot/ur5e.yml b/src/curobo/content/configs/robot/ur5e.yml
index e2b2be3..689bd9a 100644
--- a/src/curobo/content/configs/robot/ur5e.yml
+++ b/src/curobo/content/configs/robot/ur5e.yml
@@ -49,7 +49,7 @@ robot_cfg:
- "center": [-0.055, 0.008, 0.14]
"radius": 0.07
- "center": [-0.001, -0.002, 0.143]
- "radius": 0.076
+ "radius": 0.08
forearm_link:
- "center": [-0.01, 0.002, 0.031]
"radius": 0.072
diff --git a/src/curobo/content/configs/task/finetune_trajopt.yml b/src/curobo/content/configs/task/finetune_trajopt.yml
index 3d13330..467d86b 100644
--- a/src/curobo/content/configs/task/finetune_trajopt.yml
+++ b/src/curobo/content/configs/task/finetune_trajopt.yml
@@ -31,22 +31,23 @@ model:
cost:
pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
- run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
- weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+ run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
+ weight: [2000,500000.0,30,50]
vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
terminal: True
- run_weight: 0.0 #0.05
+ run_weight: 1.0
use_metric: True
link_pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
- run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+ run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
terminal: True
- run_weight: 0.0
+ run_weight: 0.001
use_metric: True
+
cspace_cfg:
weight: 10000.0
terminal: True
@@ -54,7 +55,7 @@ cost:
bound_cfg:
weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values
- smooth_weight: [0.0,2000.0, 50.0, 0.0] # [vel, acc, jerk,]
+ smooth_weight: [0.0,5000.0, 50.0, 0.0] # [vel, acc, jerk,]
run_weight_velocity: 0.0
run_weight_acceleration: 1.0
run_weight_jerk: 1.0
@@ -78,11 +79,11 @@ cost:
lbfgs:
- n_iters: 400
+ n_iters: 400 # 400
inner_iters: 25
- cold_start_n_iters: 200
+ cold_start_n_iters: null
min_iters: 25
- line_search_scale: [0.01, 0.3,0.7,1.0] # #
+ line_search_scale: [0.01, 0.3, 0.7,1.0] # #
fixed_iters: True
cost_convergence: 0.01
cost_delta_threshold: 1.0
@@ -90,7 +91,7 @@ lbfgs:
epsilon: 0.01
history: 15 #15
use_cuda_graph: True
- n_envs: 1
+ n_problems: 1
store_debug: False
use_cuda_kernel: True
stable_mode: True
diff --git a/src/curobo/content/configs/task/finetune_trajopt_slow.yml b/src/curobo/content/configs/task/finetune_trajopt_slow.yml
new file mode 100644
index 0000000..35f1a35
--- /dev/null
+++ b/src/curobo/content/configs/task/finetune_trajopt_slow.yml
@@ -0,0 +1,109 @@
+
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+
+model:
+ horizon: 32
+ state_filter_cfg:
+ filter_coeff:
+ position: 1.0
+ velocity: 1.0
+ acceleration: 1.0
+ enable: False
+ dt_traj_params:
+ base_dt: 0.2
+ base_ratio: 1.0
+ max_dt: 0.2
+ vel_scale: 1.0
+ control_space: 'POSITION'
+ state_finite_difference_mode: "CENTRAL"
+
+ teleport_mode: False
+ return_full_act_buffer: True
+cost:
+ pose_cfg:
+ vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
+ run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+ weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+ vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
+ terminal: True
+ run_weight: 0.0 #0.05
+ use_metric: True
+
+ link_pose_cfg:
+ vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
+ run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+ weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+ vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
+ terminal: True
+ run_weight: 0.001
+ use_metric: True
+
+
+ cspace_cfg:
+ weight: 10000.0
+ terminal: True
+ run_weight: 0.00 #1
+
+ bound_cfg:
+ weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values
+
+ smooth_weight: [0.0,50000.0, 500.0, 0.0] # [vel, acc, jerk,]
+
+ run_weight_velocity: 0.0
+ run_weight_acceleration: 1.0
+ run_weight_jerk: 1.0
+ activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk
+ null_space_weight: [0.0]
+
+ primitive_collision_cfg:
+ weight: 1000000.0
+ use_sweep: True
+ sweep_steps: 6
+ classify: False
+ use_sweep_kernel: True
+ use_speed_metric: True
+ speed_dt: 0.01 # used only for speed metric
+ activation_distance: 0.025
+
+ self_collision_cfg:
+ weight: 5000.0
+ classify: False
+
+
+
+lbfgs:
+ n_iters: 400 # 400
+ inner_iters: 25
+ cold_start_n_iters: 200
+ min_iters: 25
+ line_search_scale: [0.01, 0.3, 0.7,1.0] # #
+ fixed_iters: True
+ cost_convergence: 0.01
+ cost_delta_threshold: 1.0
+ cost_relative_threshold: 0.999 #0.999
+ epsilon: 0.01
+ history: 15 #15
+ use_cuda_graph: True
+ n_problems: 1
+ store_debug: False
+ use_cuda_kernel: True
+ stable_mode: True
+ line_search_type: "approx_wolfe"
+ use_cuda_line_search_kernel: True
+ use_cuda_update_best_kernel: True
+ use_temporal_smooth: False
+ sync_cuda_time: True
+ step_scale: 1.0
+ last_best: 10
+ use_coo_sparse: True
+ debug_info:
+ visual_traj : null #'ee_pos_seq'
diff --git a/src/curobo/content/configs/task/gradient_ik.yml b/src/curobo/content/configs/task/gradient_ik.yml
index 73eaf05..97e17b6 100644
--- a/src/curobo/content/configs/task/gradient_ik.yml
+++ b/src/curobo/content/configs/task/gradient_ik.yml
@@ -32,6 +32,7 @@ cost:
vec_convergence: [0.0, 0.00]
terminal: False
use_metric: True
+ run_weight: 1.0
link_pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
weight: [1000, 20000, 30, 50]
@@ -58,7 +59,7 @@ cost:
lbfgs:
n_iters: 100 #60
inner_iters: 25
- cold_start_n_iters: 100
+ cold_start_n_iters: null
min_iters: 20
line_search_scale: [0.01, 0.3, 0.7, 1.0]
fixed_iters: True
@@ -69,7 +70,7 @@ lbfgs:
history: 6
horizon: 1
use_cuda_graph: True
- n_envs: 1
+ n_problems: 1
store_debug: False
use_cuda_kernel: True
stable_mode: True
diff --git a/src/curobo/content/configs/task/gradient_mpc.yml b/src/curobo/content/configs/task/gradient_mpc.yml
index 66b2093..4de461a 100644
--- a/src/curobo/content/configs/task/gradient_mpc.yml
+++ b/src/curobo/content/configs/task/gradient_mpc.yml
@@ -83,7 +83,7 @@ lbfgs:
epsilon: 0.01
history: 15 #15
use_cuda_graph: True
- n_envs: 1
+ n_problems: 1
store_debug: False
use_cuda_kernel: True
stable_mode: True
diff --git a/src/curobo/content/configs/task/gradient_trajopt.yml b/src/curobo/content/configs/task/gradient_trajopt.yml
index a786659..471d186 100644
--- a/src/curobo/content/configs/task/gradient_trajopt.yml
+++ b/src/curobo/content/configs/task/gradient_trajopt.yml
@@ -31,11 +31,11 @@ model:
cost:
pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
- run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+ run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
weight: [2000,50000.0,30,60] #[150.0, 2000.0, 30, 40]
vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
terminal: True
- run_weight: 0.00
+ run_weight: 1.0
use_metric: True
link_pose_cfg:
@@ -44,7 +44,7 @@ cost:
weight: [2000,50000.0,30,50] #[150.0, 2000.0, 30, 40]
vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
terminal: True
- run_weight: 0.0
+ run_weight: 0.001
use_metric: True
cspace_cfg:
@@ -54,7 +54,9 @@ cost:
bound_cfg:
weight: [5000.0, 50000.0, 50000.0,50000.0] # needs to be 3 values
+ #weight: [000.0, 000.0, 000.0,000.0]
smooth_weight: [0.0,10000.0,10.0, 0.0] # [vel, acc, jerk, alpha_v-not-used]
+ #smooth_weight: [0.0,0000.0,0.0, 0.0] # [vel, acc, jerk, alpha_v-not-used]
run_weight_velocity: 0.00
run_weight_acceleration: 1.0
run_weight_jerk: 1.0
@@ -79,9 +81,9 @@ cost:
lbfgs:
- n_iters: 175 #175
+ n_iters: 125 #175
inner_iters: 25
- cold_start_n_iters: 150
+ cold_start_n_iters: null
min_iters: 25
line_search_scale: [0.01,0.3,0.7,1.0] #[0.01,0.2, 0.3,0.5,0.7,0.9, 1.0] #
fixed_iters: True
@@ -91,7 +93,7 @@ lbfgs:
epsilon: 0.01
history: 15
use_cuda_graph: True
- n_envs: 1
+ n_problems: 1
store_debug: False
use_cuda_kernel: True
stable_mode: True
diff --git a/src/curobo/content/configs/task/particle_ik.yml b/src/curobo/content/configs/task/particle_ik.yml
index 67b7ba7..903e549 100644
--- a/src/curobo/content/configs/task/particle_ik.yml
+++ b/src/curobo/content/configs/task/particle_ik.yml
@@ -31,6 +31,7 @@ cost:
vec_convergence: [0.00, 0.000] # orientation, position
terminal: False
use_metric: True
+ run_weight: 1.0
link_pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
weight: [30, 50, 10, 10] #[20.0, 100.0]
@@ -56,7 +57,7 @@ mppi:
init_cov : 1.0 #0.15 #.5 #.5
gamma : 1.0
n_iters : 4
- cold_start_n_iters: 4
+ cold_start_n_iters: null
step_size_mean : 0.9
step_size_cov : 0.1
beta : 0.01
@@ -69,12 +70,12 @@ mppi:
sample_mode : 'BEST'
base_action : 'REPEAT'
squash_fn : 'CLAMP'
- n_envs : 1
+ n_problems : 1
use_cuda_graph : True
seed : 0
store_debug : False
random_mean : False
- sample_per_env : True
+ sample_per_problem: True
sync_cuda_time : False
use_coo_sparse : True
sample_params:
diff --git a/src/curobo/content/configs/task/particle_mpc.yml b/src/curobo/content/configs/task/particle_mpc.yml
index eec1adf..2b5b7e8 100644
--- a/src/curobo/content/configs/task/particle_mpc.yml
+++ b/src/curobo/content/configs/task/particle_mpc.yml
@@ -89,12 +89,12 @@ mppi:
sample_mode : 'BEST'
base_action : 'REPEAT'
squash_fn : 'CLAMP'
- n_envs : 1
+ n_problems : 1
use_cuda_graph : True
seed : 0
store_debug : False
random_mean : True
- sample_per_env : False
+ sample_per_problem: False
sync_cuda_time : True
use_coo_sparse : True
sample_params:
diff --git a/src/curobo/content/configs/task/particle_trajopt.yml b/src/curobo/content/configs/task/particle_trajopt.yml
index 4183c6b..1c76623 100644
--- a/src/curobo/content/configs/task/particle_trajopt.yml
+++ b/src/curobo/content/configs/task/particle_trajopt.yml
@@ -30,17 +30,17 @@ model:
cost:
pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
- run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight
+ run_vec_weight: [0.0,0.0,0.0,0.0,0.0,0.0] # running weight
weight: [250.0, 5000.0, 20, 20]
vec_convergence: [0.0,0.0,1000.0,1000.0]
terminal: True
- run_weight: 0.00
+ run_weight: 1.0
use_metric: True
link_pose_cfg:
vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
- run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight
- weight: [250.0, 5000.0, 40, 40]
+ run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
+ weight: [0.0, 5000.0, 40, 40]
vec_convergence: [0.0,0.0,1000.0,1000.0]
terminal: True
run_weight: 0.00
@@ -54,10 +54,12 @@ cost:
bound_cfg:
weight: [0.1, 0.1,0.0,0.0]
activation_distance: [0.0,0.0,0.0,0.0] #-#0.01
+ #smooth_weight: [0.0,100.0,1.0,0.0]
smooth_weight: [0.0,20.0,0.0,0.0]
run_weight_velocity: 0.0
run_weight_acceleration: 1.0
run_weight_jerk: 1.0
+ null_space_weight: [0.00]
primitive_collision_cfg:
weight: 5000.0
@@ -80,7 +82,7 @@ mppi:
init_cov : 0.5
gamma : 1.0
n_iters : 2
- cold_start_n_iters: 2
+ cold_start_n_iters: null
step_size_mean : 0.9
step_size_cov : 0.01
beta : 0.01
@@ -93,12 +95,12 @@ mppi:
sample_mode : 'BEST'
base_action : 'REPEAT'
squash_fn : 'CLAMP'
- n_envs : 1
+ n_problems : 1
use_cuda_graph : True
seed : 0
store_debug : False
random_mean : True
- sample_per_env : True
+ sample_per_problem: True
sync_cuda_time : False
use_coo_sparse : True
sample_params:
diff --git a/src/curobo/content/configs/world/collision_table.yml b/src/curobo/content/configs/world/collision_table.yml
index 169a95f..526b550 100644
--- a/src/curobo/content/configs/world/collision_table.yml
+++ b/src/curobo/content/configs/world/collision_table.yml
@@ -11,4 +11,4 @@
cuboid:
table:
dims: [5.0, 5.0, 0.2] # x, y, z
- pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw
+ pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw
\ No newline at end of file
diff --git a/src/curobo/cuda_robot_model/cuda_robot_generator.py b/src/curobo/cuda_robot_model/cuda_robot_generator.py
index d105348..60dbea8 100644
--- a/src/curobo/cuda_robot_model/cuda_robot_generator.py
+++ b/src/curobo/cuda_robot_model/cuda_robot_generator.py
@@ -145,6 +145,9 @@ class CudaRobotGeneratorConfig:
#: cspace config
cspace: Union[None, CSpaceConfig, Dict[str, List[Any]]] = None
+ #: Enable loading meshes from kinematics parser.
+ load_meshes: bool = False
+
def __post_init__(self):
# add root path:
# Check if an external asset path is provided:
@@ -283,7 +286,10 @@ class CudaRobotGenerator(CudaRobotGeneratorConfig):
)
else:
self._kinematics_parser = UrdfKinematicsParser(
- self.urdf_path, mesh_root=self.asset_root_path, extra_links=self.extra_links
+ self.urdf_path,
+ mesh_root=self.asset_root_path,
+ extra_links=self.extra_links,
+ load_meshes=self.load_meshes,
)
if self.lock_joints is None:
diff --git a/src/curobo/cuda_robot_model/cuda_robot_model.py b/src/curobo/cuda_robot_model/cuda_robot_model.py
index 78f4370..6f5a45a 100644
--- a/src/curobo/cuda_robot_model/cuda_robot_model.py
+++ b/src/curobo/cuda_robot_model/cuda_robot_model.py
@@ -30,7 +30,7 @@ from curobo.cuda_robot_model.types import (
SelfCollisionKinematicsConfig,
)
from curobo.curobolib.kinematics import get_cuda_kinematics
-from curobo.geom.types import Sphere
+from curobo.geom.types import Mesh, Sphere
from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose
from curobo.util.logger import log_error
@@ -390,4 +390,8 @@ class CudaRobotModel(CudaRobotModelConfig):
return self.kinematics_config.base_link
def update_kinematics_config(self, new_kin_config: KinematicsTensorConfig):
- self.kinematics_config.copy(new_kin_config)
+ self.kinematics_config.copy_(new_kin_config)
+
+ @property
+ def retract_config(self):
+ return self.kinematics_config.cspace.retract_config
diff --git a/src/curobo/cuda_robot_model/types.py b/src/curobo/cuda_robot_model/types.py
index a620457..9c782ad 100644
--- a/src/curobo/cuda_robot_model/types.py
+++ b/src/curobo/cuda_robot_model/types.py
@@ -23,7 +23,7 @@ from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose
from curobo.types.state import JointState
from curobo.types.tensor import T_DOF
-from curobo.util.tensor_util import copy_if_not_none
+from curobo.util.tensor_util import clone_if_not_none, copy_if_not_none
class JointType(Enum):
@@ -75,20 +75,28 @@ class JointLimits:
self.tensor_args,
)
+ def copy_(self, new_jl: JointLimits):
+ self.joint_names = new_jl.joint_names.copy()
+ self.position.copy_(new_jl.position)
+ self.velocity.copy_(new_jl.velocity)
+ self.acceleration.copy_(new_jl.acceleration)
+ self.effort = copy_if_not_none(new_jl.effort, self.effort)
+ return self
+
@dataclass
class CSpaceConfig:
joint_names: List[str]
retract_config: Optional[T_DOF] = None
cspace_distance_weight: Optional[T_DOF] = None
- null_space_weight: Optional[T_DOF] = None # List[float]
+ null_space_weight: Optional[T_DOF] = None
tensor_args: TensorDeviceType = TensorDeviceType()
max_acceleration: Union[float, List[float]] = 10.0
max_jerk: Union[float, List[float]] = 500.0
velocity_scale: Union[float, List[float]] = 1.0
acceleration_scale: Union[float, List[float]] = 1.0
jerk_scale: Union[float, List[float]] = 1.0
- position_limit_clip: Union[float, List[float]] = 0.01
+ position_limit_clip: Union[float, List[float]] = 0.0
def __post_init__(self):
if self.retract_config is not None:
@@ -147,12 +155,31 @@ class CSpaceConfig:
joint_names = [self.joint_names[n] for n in new_index]
self.joint_names = joint_names
+ def copy_(self, new_config: CSpaceConfig):
+ self.joint_names = new_config.joint_names.copy()
+ self.retract_config = copy_if_not_none(new_config.retract_config, self.retract_config)
+ self.null_space_weight = copy_if_not_none(
+ new_config.null_space_weight, self.null_space_weight
+ )
+ self.cspace_distance_weight = copy_if_not_none(
+ new_config.cspace_distance_weight, self.cspace_distance_weight
+ )
+ self.tensor_args = self.tensor_args
+ self.max_jerk = copy_if_not_none(new_config.max_jerk, self.max_jerk)
+ self.max_acceleration = copy_if_not_none(new_config.max_acceleration, self.max_acceleration)
+ self.velocity_scale = copy_if_not_none(new_config.velocity_scale, self.velocity_scale)
+ self.acceleration_scale = copy_if_not_none(
+ new_config.acceleration_scale, self.acceleration_scale
+ )
+ self.jerk_scale = copy_if_not_none(new_config.jerk_scale, self.jerk_scale)
+ return self
+
def clone(self) -> CSpaceConfig:
return CSpaceConfig(
joint_names=self.joint_names.copy(),
- retract_config=copy_if_not_none(self.retract_config),
- null_space_weight=copy_if_not_none(self.null_space_weight),
- cspace_distance_weight=copy_if_not_none(self.cspace_distance_weight),
+ retract_config=clone_if_not_none(self.retract_config),
+ null_space_weight=clone_if_not_none(self.null_space_weight),
+ cspace_distance_weight=clone_if_not_none(self.cspace_distance_weight),
tensor_args=self.tensor_args,
max_jerk=self.max_jerk.clone(),
max_acceleration=self.max_acceleration.clone(),
@@ -215,12 +242,48 @@ class KinematicsTensorConfig:
cspace: Optional[CSpaceConfig] = None
base_link: str = "base_link"
ee_link: str = "ee_link"
+ reference_link_spheres: Optional[torch.Tensor] = None
def __post_init__(self):
if self.cspace is None and self.joint_limits is not None:
self.load_cspace_cfg_from_kinematics()
if self.joint_limits is not None and self.cspace is not None:
self.joint_limits = self.cspace.scale_joint_limits(self.joint_limits)
+ if self.link_spheres is not None and self.reference_link_spheres is None:
+ self.reference_link_spheres = self.link_spheres.clone()
+
+ def copy_(self, new_config: KinematicsTensorConfig):
+ self.fixed_transforms.copy_(new_config.fixed_transforms)
+ self.link_map.copy_(new_config.link_map)
+ self.joint_map.copy_(new_config.joint_map)
+ self.joint_map_type.copy_(new_config.joint_map_type)
+ self.store_link_map.copy_(new_config.store_link_map)
+ self.link_chain_map.copy_(new_config.link_chain_map)
+ self.joint_limits.copy_(new_config.joint_limits)
+ if new_config.link_spheres is not None and self.link_spheres is not None:
+ self.link_spheres.copy_(new_config.link_spheres)
+ if new_config.link_sphere_idx_map is not None and self.link_sphere_idx_map is not None:
+ self.link_sphere_idx_map.copy_(new_config.link_sphere_idx_map)
+ if new_config.link_name_to_idx_map is not None and self.link_name_to_idx_map is not None:
+ self.link_name_to_idx_map = new_config.link_name_to_idx_map.copy()
+ if (
+ new_config.reference_link_spheres is not None
+ and self.reference_link_spheres is not None
+ ):
+ self.reference_link_spheres.copy_(new_config.reference_link_spheres)
+ self.base_link = new_config.base_link
+ self.ee_idx = new_config.ee_idx
+ self.ee_link = new_config.ee_link
+ self.debug = new_config.debug
+ self.cspace.copy_(new_config.cspace)
+ self.n_dof = new_config.n_dof
+ self.non_fixed_joint_names = new_config.non_fixed_joint_names
+ self.joint_names = new_config.joint_names
+ self.link_names = new_config.link_names
+ self.mesh_link_names = new_config.mesh_link_names
+ self.total_spheres = new_config.total_spheres
+
+ return self
def load_cspace_cfg_from_kinematics(self):
retract_config = (
@@ -270,6 +333,13 @@ class KinematicsTensorConfig:
link_sphere_index = self.get_sphere_index_from_link_name(link_name)
return self.link_spheres[link_sphere_index, :]
+ def get_reference_link_spheres(
+ self,
+ link_name: str,
+ ):
+ link_sphere_index = self.get_sphere_index_from_link_name(link_name)
+ return self.reference_link_spheres[link_sphere_index, :]
+
def attach_object(
self,
sphere_radius: Optional[float] = None,
@@ -332,6 +402,19 @@ class KinematicsTensorConfig:
"""
return self.get_link_spheres(link_name).shape[0]
+ def disable_link_spheres(self, link_name: str):
+ if link_name not in self.link_name_to_idx_map.keys():
+ raise ValueError(link_name + " not found in spheres")
+ curr_spheres = self.get_link_spheres(link_name)
+ curr_spheres[:, 3] = -100.0
+ self.update_link_spheres(link_name, curr_spheres)
+
+ def enable_link_spheres(self, link_name: str):
+ if link_name not in self.link_name_to_idx_map.keys():
+ raise ValueError(link_name + " not found in spheres")
+ curr_spheres = self.get_reference_link_spheres(link_name)
+ self.update_link_spheres(link_name, curr_spheres)
+
@dataclass
class SelfCollisionKinematicsConfig:
diff --git a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
index aa5b6e3..e116f68 100644
--- a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
+++ b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
@@ -183,6 +183,7 @@ class UrdfKinematicsParser(KinematicsParser):
else:
# convert to list:
mesh_pose = Pose.from_matrix(mesh_pose).to_list()
+
return CuroboMesh(
name=link_name,
pose=mesh_pose,
diff --git a/src/curobo/cuda_robot_model/usd_kinematics_parser.py b/src/curobo/cuda_robot_model/usd_kinematics_parser.py
index d9dc978..2715b0c 100644
--- a/src/curobo/cuda_robot_model/usd_kinematics_parser.py
+++ b/src/curobo/cuda_robot_model/usd_kinematics_parser.py
@@ -13,7 +13,6 @@ from typing import Dict, List, Optional, Tuple
# Third Party
import numpy as np
-from pxr import Usd, UsdPhysics
# CuRobo
from curobo.cuda_robot_model.kinematics_parser import KinematicsParser, LinkParams
@@ -22,6 +21,15 @@ from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose
from curobo.util.logger import log_error
+try:
+ # Third Party
+ from pxr import Usd, UsdPhysics
+except ImportError:
+ raise ImportError(
+ "usd-core failed to import, install with pip install usd-core"
+ + " NOTE: Do not install this if using with ISAAC SIM."
+ )
+
class UsdKinematicsParser(KinematicsParser):
"""An experimental kinematics parser from USD.
diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp
index cd69a6f..3775edb 100644
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -16,43 +16,55 @@
// CUDA forward declarations
-std::vector self_collision_distance(
- torch::Tensor out_distance, torch::Tensor out_vec,
- torch::Tensor sparse_index,
- const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
- const torch::Tensor collision_offset, // n_spheres x n_spheres
- const torch::Tensor weight,
- const torch::Tensor collision_matrix, // n_spheres x n_spheres
- const torch::Tensor thread_locations, const int locations_size,
- const int batch_size, const int nspheres, const bool compute_grad = false,
- const int ndpt = 8, // Does this need to match template?
- const bool debug = false);
+std::vectorself_collision_distance(
+ torch::Tensor out_distance,
+ torch::Tensor out_vec,
+ torch::Tensor sparse_index,
+ const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
+ const torch::Tensor collision_offset, // n_spheres x n_spheres
+ const torch::Tensor weight,
+ const torch::Tensor collision_matrix, // n_spheres x n_spheres
+ const torch::Tensor thread_locations,
+ const int locations_size,
+ const int batch_size,
+ const int nspheres,
+ const bool compute_grad = false,
+ const int ndpt = 8, // Does this need to match template?
+ const bool debug = false);
// CUDA forward declarations
-std::vector swept_sphere_obb_clpt(
- const torch::Tensor sphere_position, // batch_size, 3
- torch::Tensor distance, // batch_size, 1
- torch::Tensor
- closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
- torch::Tensor sparsity_idx, const torch::Tensor weight,
- const torch::Tensor activation_distance, const torch::Tensor speed_dt,
- const torch::Tensor obb_accel, // n_boxes, 4, 4
- const torch::Tensor obb_bounds, // n_boxes, 3
- const torch::Tensor obb_pose, // n_boxes, 4, 4
- const torch::Tensor obb_enable, // n_boxes, 4,
- const torch::Tensor n_env_obb, // n_boxes, 4, 4
- const torch::Tensor env_query_idx, // n_boxes, 4, 4
- const int max_nobs, const int batch_size, const int horizon,
- const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
- const bool transform_back, const bool compute_distance,
- const bool use_batch_env);
+std::vectorswept_sphere_obb_clpt(
+ const torch::Tensor sphere_position, // batch_size, 3
+ torch::Tensor distance, // batch_size, 1
+ torch::Tensor
+ closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
+ torch::Tensor sparsity_idx,
+ const torch::Tensor weight,
+ const torch::Tensor activation_distance,
+ const torch::Tensor speed_dt,
+ const torch::Tensor obb_accel, // n_boxes, 4, 4
+ const torch::Tensor obb_bounds, // n_boxes, 3
+ const torch::Tensor obb_pose, // n_boxes, 4, 4
+ const torch::Tensor obb_enable, // n_boxes, 4,
+ const torch::Tensor n_env_obb, // n_boxes, 4, 4
+ const torch::Tensor env_query_idx, // n_boxes, 4, 4
+ const int max_nobs,
+ const int batch_size,
+ const int horizon,
+ const int n_spheres,
+ const int sweep_steps,
+ const bool enable_speed_metric,
+ const bool transform_back,
+ const bool compute_distance,
+ const bool use_batch_env);
std::vector
sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
- torch::Tensor distance,
- torch::Tensor closest_point, // batch size, 3
- torch::Tensor sparsity_idx, const torch::Tensor weight,
+ torch::Tensor distance,
+ torch::Tensor closest_point, // batch size, 3
+ torch::Tensor sparsity_idx,
+ const torch::Tensor weight,
const torch::Tensor activation_distance,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
@@ -60,58 +72,75 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
- const int max_nobs, const int batch_size, const int horizon,
- const int n_spheres, const bool transform_back,
- const bool compute_distance, const bool use_batch_env);
+ const int max_nobs,
+ const int batch_size,
+ const int horizon,
+ const int n_spheres,
+ const bool transform_back,
+ const bool compute_distance,
+ const bool use_batch_env);
-std::vector pose_distance(
- torch::Tensor out_distance, torch::Tensor out_position_distance,
- torch::Tensor out_rotation_distance,
- torch::Tensor distance_p_vector, // batch size, 3
- torch::Tensor distance_q_vector, // batch size, 4
- torch::Tensor out_gidx,
- const torch::Tensor current_position, // batch_size, 3
- const torch::Tensor goal_position, // n_boxes, 3
- const torch::Tensor current_quat, const torch::Tensor goal_quat,
- const torch::Tensor vec_weight, // n_boxes, 4, 4
- const torch::Tensor weight, // n_boxes, 4, 4
- const torch::Tensor vec_convergence, const torch::Tensor run_weight,
- const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
- const int batch_size, const int horizon, const int mode,
- const int num_goals = 1, const bool compute_grad = false,
- const bool write_distance = true, const bool use_metric = false);
+std::vectorpose_distance(
+ torch::Tensor out_distance,
+ torch::Tensor out_position_distance,
+ torch::Tensor out_rotation_distance,
+ torch::Tensor distance_p_vector, // batch size, 3
+ torch::Tensor distance_q_vector, // batch size, 4
+ torch::Tensor out_gidx,
+ const torch::Tensor current_position, // batch_size, 3
+ const torch::Tensor goal_position, // n_boxes, 3
+ const torch::Tensor current_quat,
+ const torch::Tensor goal_quat,
+ const torch::Tensor vec_weight, // n_boxes, 4, 4
+ const torch::Tensor weight, // n_boxes, 4, 4
+ const torch::Tensor vec_convergence,
+ const torch::Tensor run_weight,
+ const torch::Tensor run_vec_weight,
+ const torch::Tensor offset_waypoint,
+ const torch::Tensor offset_tstep_fraction,
+ const torch::Tensor batch_pose_idx,
+ const int batch_size,
+ const int horizon,
+ const int mode,
+ const int num_goals = 1,
+ const bool compute_grad = false,
+ const bool write_distance = true,
+ const bool use_metric = false,
+ const bool project_distance = true);
std::vector
-backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+backward_pose_distance(torch::Tensor out_grad_p,
+ torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance,
const torch::Tensor pose_weight,
- const torch::Tensor grad_p_vec, // n_boxes, 4, 4
- const torch::Tensor grad_q_vec, const int batch_size,
- const bool use_distance = false);
+ const torch::Tensor grad_p_vec, // n_boxes, 4, 4
+ const torch::Tensor grad_q_vec,
+ const int batch_size,
+ const bool use_distance = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
- AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x) \
- CHECK_CUDA(x); \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+ AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+ CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
-std::vector self_collision_distance_wrapper(
- torch::Tensor out_distance, torch::Tensor out_vec,
- torch::Tensor sparse_index,
- const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
- const torch::Tensor collision_offset, // n_spheres
- const torch::Tensor weight,
- const torch::Tensor collision_matrix, // n_spheres
- const torch::Tensor thread_locations, const int thread_locations_size,
- const int batch_size, const int nspheres, const bool compute_grad = false,
- const int ndpt = 8, const bool debug = false) {
-
+std::vectorself_collision_distance_wrapper(
+ torch::Tensor out_distance, torch::Tensor out_vec,
+ torch::Tensor sparse_index,
+ const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
+ const torch::Tensor collision_offset, // n_spheres
+ const torch::Tensor weight,
+ const torch::Tensor collision_matrix, // n_spheres
+ const torch::Tensor thread_locations, const int thread_locations_size,
+ const int batch_size, const int nspheres, const bool compute_grad = false,
+ const int ndpt = 8, const bool debug = false)
+{
CHECK_INPUT(out_distance);
CHECK_INPUT(out_vec);
CHECK_INPUT(robot_spheres);
@@ -123,29 +152,30 @@ std::vector self_collision_distance_wrapper(
const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());
return self_collision_distance(
- out_distance, out_vec, sparse_index, robot_spheres,
- collision_offset, weight, collision_matrix, thread_locations,
- thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
+ out_distance, out_vec, sparse_index, robot_spheres,
+ collision_offset, weight, collision_matrix, thread_locations,
+ thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
}
-std::vector sphere_obb_clpt_wrapper(
- const torch::Tensor sphere_position, // batch_size, 4
-
- torch::Tensor distance,
- torch::Tensor closest_point, // batch size, 3
- torch::Tensor sparsity_idx, const torch::Tensor weight,
- const torch::Tensor activation_distance,
- const torch::Tensor obb_accel, // n_boxes, 4, 4
- const torch::Tensor obb_bounds, // n_boxes, 3
- const torch::Tensor obb_pose, // n_boxes, 4, 4
- const torch::Tensor obb_enable, // n_boxes, 4, 4
- const torch::Tensor n_env_obb, // n_boxes, 4, 4
- const torch::Tensor env_query_idx, // n_boxes, 4, 4
- const int max_nobs, const int batch_size, const int horizon,
- const int n_spheres, const bool transform_back, const bool compute_distance,
- const bool use_batch_env) {
+std::vectorsphere_obb_clpt_wrapper(
+ const torch::Tensor sphere_position, // batch_size, 4
+ torch::Tensor distance,
+ torch::Tensor closest_point, // batch size, 3
+ torch::Tensor sparsity_idx, const torch::Tensor weight,
+ const torch::Tensor activation_distance,
+ const torch::Tensor obb_accel, // n_boxes, 4, 4
+ const torch::Tensor obb_bounds, // n_boxes, 3
+ const torch::Tensor obb_pose, // n_boxes, 4, 4
+ const torch::Tensor obb_enable, // n_boxes, 4, 4
+ const torch::Tensor n_env_obb, // n_boxes, 4, 4
+ const torch::Tensor env_query_idx, // n_boxes, 4, 4
+ const int max_nobs, const int batch_size, const int horizon,
+ const int n_spheres, const bool transform_back, const bool compute_distance,
+ const bool use_batch_env)
+{
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
@@ -154,55 +184,61 @@ std::vector sphere_obb_clpt_wrapper(
CHECK_INPUT(activation_distance);
CHECK_INPUT(obb_accel);
return sphere_obb_clpt(
- sphere_position, distance, closest_point, sparsity_idx, weight,
- activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
- n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
- transform_back, compute_distance, use_batch_env);
+ sphere_position, distance, closest_point, sparsity_idx, weight,
+ activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
+ n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
+ transform_back, compute_distance, use_batch_env);
}
-std::vector swept_sphere_obb_clpt_wrapper(
- const torch::Tensor sphere_position, // batch_size, 4
- torch::Tensor distance,
- torch::Tensor closest_point, // batch size, 3
- torch::Tensor sparsity_idx, const torch::Tensor weight,
- const torch::Tensor activation_distance, const torch::Tensor speed_dt,
- const torch::Tensor obb_accel, // n_boxes, 4, 4
- const torch::Tensor obb_bounds, // n_boxes, 3
- const torch::Tensor obb_pose, // n_boxes, 4, 4
- const torch::Tensor obb_enable, // n_boxes, 4, 4
- const torch::Tensor n_env_obb, // n_boxes, 4, 4
- const torch::Tensor env_query_idx, // n_boxes, 4, 4
- const int max_nobs, const int batch_size, const int horizon,
- const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
- const bool transform_back, const bool compute_distance,
- const bool use_batch_env) {
+
+std::vectorswept_sphere_obb_clpt_wrapper(
+ const torch::Tensor sphere_position, // batch_size, 4
+ torch::Tensor distance,
+ torch::Tensor closest_point, // batch size, 3
+ torch::Tensor sparsity_idx, const torch::Tensor weight,
+ const torch::Tensor activation_distance, const torch::Tensor speed_dt,
+ const torch::Tensor obb_accel, // n_boxes, 4, 4
+ const torch::Tensor obb_bounds, // n_boxes, 3
+ const torch::Tensor obb_pose, // n_boxes, 4, 4
+ const torch::Tensor obb_enable, // n_boxes, 4, 4
+ const torch::Tensor n_env_obb, // n_boxes, 4, 4
+ const torch::Tensor env_query_idx, // n_boxes, 4, 4
+ const int max_nobs, const int batch_size, const int horizon,
+ const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
+ const bool transform_back, const bool compute_distance,
+ const bool use_batch_env)
+{
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
return swept_sphere_obb_clpt(
- sphere_position,
- distance, closest_point, sparsity_idx, weight, activation_distance,
- speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
- env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
- enable_speed_metric, transform_back, compute_distance, use_batch_env);
+ sphere_position,
+ distance, closest_point, sparsity_idx, weight, activation_distance,
+ speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
+ env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
+ enable_speed_metric, transform_back, compute_distance, use_batch_env);
}
-std::vector pose_distance_wrapper(
- torch::Tensor out_distance, torch::Tensor out_position_distance,
- torch::Tensor out_rotation_distance,
- torch::Tensor distance_p_vector, // batch size, 3
- torch::Tensor distance_q_vector, // batch size, 4
- torch::Tensor out_gidx,
- const torch::Tensor current_position, // batch_size, 3
- const torch::Tensor goal_position, // n_boxes, 3
- const torch::Tensor current_quat, const torch::Tensor goal_quat,
- const torch::Tensor vec_weight, // n_boxes, 4, 4
- const torch::Tensor weight, const torch::Tensor vec_convergence,
- const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
- const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
- const int mode, const int num_goals = 1, const bool compute_grad = false,
- const bool write_distance = false, const bool use_metric = false) {
+std::vectorpose_distance_wrapper(
+ torch::Tensor out_distance, torch::Tensor out_position_distance,
+ torch::Tensor out_rotation_distance,
+ torch::Tensor distance_p_vector, // batch size, 3
+ torch::Tensor distance_q_vector, // batch size, 4
+ torch::Tensor out_gidx,
+ const torch::Tensor current_position, // batch_size, 3
+ const torch::Tensor goal_position, // n_boxes, 3
+ const torch::Tensor current_quat, const torch::Tensor goal_quat,
+ const torch::Tensor vec_weight, // n_boxes, 4, 4
+ const torch::Tensor weight, const torch::Tensor vec_convergence,
+ const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
+ const torch::Tensor offset_waypoint, const torch::Tensor offset_tstep_fraction,
+ const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
+ const int mode, const int num_goals = 1, const bool compute_grad = false,
+ const bool write_distance = false, const bool use_metric = false,
+ const bool project_distance = true)
+{
// at::cuda::DeviceGuard guard(angle.device());
CHECK_INPUT(out_distance);
CHECK_INPUT(out_position_distance);
@@ -214,24 +250,30 @@ std::vector pose_distance_wrapper(
CHECK_INPUT(current_quat);
CHECK_INPUT(goal_quat);
CHECK_INPUT(batch_pose_idx);
+ CHECK_INPUT(offset_waypoint);
+ CHECK_INPUT(offset_tstep_fraction);
const at::cuda::OptionalCUDAGuard guard(current_position.device());
return pose_distance(
- out_distance, out_position_distance, out_rotation_distance,
- distance_p_vector, distance_q_vector, out_gidx, current_position,
- goal_position, current_quat, goal_quat, vec_weight, weight,
- vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
- horizon, mode, num_goals, compute_grad, write_distance, use_metric);
+ out_distance, out_position_distance, out_rotation_distance,
+ distance_p_vector, distance_q_vector, out_gidx, current_position,
+ goal_position, current_quat, goal_quat, vec_weight, weight,
+ vec_convergence, run_weight, run_vec_weight,
+ offset_waypoint,
+ offset_tstep_fraction,
+ batch_pose_idx, batch_size,
+ horizon, mode, num_goals, compute_grad, write_distance, use_metric, project_distance);
}
-std::vector backward_pose_distance_wrapper(
- torch::Tensor out_grad_p, torch::Tensor out_grad_q,
- const torch::Tensor grad_distance, // batch_size, 3
- const torch::Tensor grad_p_distance, // n_boxes, 3
- const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
- const torch::Tensor grad_p_vec, // n_boxes, 4, 4
- const torch::Tensor grad_q_vec, const int batch_size,
- const bool use_distance) {
+std::vectorbackward_pose_distance_wrapper(
+ torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+ const torch::Tensor grad_distance, // batch_size, 3
+ const torch::Tensor grad_p_distance, // n_boxes, 3
+ const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
+ const torch::Tensor grad_p_vec, // n_boxes, 4, 4
+ const torch::Tensor grad_q_vec, const int batch_size,
+ const bool use_distance)
+{
CHECK_INPUT(out_grad_p);
CHECK_INPUT(out_grad_q);
CHECK_INPUT(grad_distance);
@@ -241,18 +283,19 @@ std::vector backward_pose_distance_wrapper(
const at::cuda::OptionalCUDAGuard guard(grad_distance.device());
return backward_pose_distance(
- out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
- pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
+ out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
+ pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
}
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
- m.def("pose_distance_backward", &backward_pose_distance_wrapper,
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
+ m.def("pose_distance_backward", &backward_pose_distance_wrapper,
"Pose Distance Backward (curobolib)");
- m.def("closest_point", &sphere_obb_clpt_wrapper,
+ m.def("closest_point", &sphere_obb_clpt_wrapper,
"Closest Point OBB(curobolib)");
- m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
+ m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
"Swept Closest Point OBB(curobolib)");
m.def("self_collision_distance", &self_collision_distance_wrapper,
diff --git a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
index c73bff9..b7dda16 100644
--- a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
+++ b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
@@ -16,91 +16,112 @@
// CUDA forward declarations
std::vector
-matrix_to_quaternion(torch::Tensor out_quat,
+matrix_to_quaternion(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
-);
+ );
-std::vector kin_fused_forward(
- torch::Tensor link_pos, torch::Tensor link_quat,
- torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
- const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
- const torch::Tensor robot_spheres, const torch::Tensor link_map,
- const torch::Tensor joint_map, const torch::Tensor joint_map_type,
- const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
- const int batch_size, const int n_spheres,
- const bool use_global_cumul = false);
+std::vectorkin_fused_forward(
+ torch::Tensor link_pos,
+ torch::Tensor link_quat,
+ torch::Tensor batch_robot_spheres,
+ torch::Tensor global_cumul_mat,
+ const torch::Tensor joint_vec,
+ const torch::Tensor fixed_transform,
+ const torch::Tensor robot_spheres,
+ const torch::Tensor link_map,
+ const torch::Tensor joint_map,
+ const torch::Tensor joint_map_type,
+ const torch::Tensor store_link_map,
+ const torch::Tensor link_sphere_map,
+ const int batch_size,
+ const int n_spheres,
+ const bool use_global_cumul = false);
+
+std::vectorkin_fused_backward_16t(
+ torch::Tensor grad_out,
+ const torch::Tensor grad_nlinks_pos,
+ const torch::Tensor grad_nlinks_quat,
+ const torch::Tensor grad_spheres,
+ const torch::Tensor global_cumul_mat,
+ const torch::Tensor joint_vec,
+ const torch::Tensor fixed_transform,
+ const torch::Tensor robot_spheres,
+ const torch::Tensor link_map,
+ const torch::Tensor joint_map,
+ const torch::Tensor joint_map_type,
+ const torch::Tensor store_link_map,
+ const torch::Tensor link_sphere_map,
+ const torch::Tensor link_chain_map,
+ const int batch_size,
+ const int n_spheres,
+ const bool sparsity_opt = true,
+ const bool use_global_cumul = false);
-std::vector kin_fused_backward_16t(
- torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
- const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
- const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
- const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
- const torch::Tensor link_map, const torch::Tensor joint_map,
- const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
- const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
- const int batch_size, const int n_spheres, const bool sparsity_opt = true,
- const bool use_global_cumul = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
- AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x) \
- CHECK_CUDA(x); \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+ AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+ CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
-std::vector kin_forward_wrapper(
- torch::Tensor link_pos, torch::Tensor link_quat,
- torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
- const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
- const torch::Tensor robot_spheres, const torch::Tensor link_map,
- const torch::Tensor joint_map, const torch::Tensor joint_map_type,
- const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
- const int batch_size, const int n_spheres,
- const bool use_global_cumul = false) {
-
+std::vectorkin_forward_wrapper(
+ torch::Tensor link_pos, torch::Tensor link_quat,
+ torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+ const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+ const torch::Tensor robot_spheres, const torch::Tensor link_map,
+ const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+ const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+ const int batch_size, const int n_spheres,
+ const bool use_global_cumul = false)
+{
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
// TODO: add check input
return kin_fused_forward(
- link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
- fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
- store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
+ link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
+ fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
+ store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
}
-std::vector kin_backward_wrapper(
- torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
- const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
- const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
- const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
- const torch::Tensor link_map, const torch::Tensor joint_map,
- const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
- const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
- const int batch_size, const int n_spheres, const bool sparsity_opt = true,
- const bool use_global_cumul = false) {
+std::vectorkin_backward_wrapper(
+ torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+ const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+ const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+ const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+ const torch::Tensor link_map, const torch::Tensor joint_map,
+ const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+ const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+ const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+ const bool use_global_cumul = false)
+{
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
return kin_fused_backward_16t(
- grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
- global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
- joint_map, joint_map_type, store_link_map, link_sphere_map,
- link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
+ grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
+ global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
+ joint_map, joint_map_type, store_link_map, link_sphere_map,
+ link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
}
std::vector
-matrix_to_quaternion_wrapper(torch::Tensor out_quat,
+matrix_to_quaternion_wrapper(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
-) {
+ )
+{
const at::cuda::OptionalCUDAGuard guard(in_rot.device());
+
CHECK_INPUT(in_rot);
CHECK_INPUT(out_quat);
return matrix_to_quaternion(out_quat, in_rot);
}
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
- m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
+ m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
"Rotation Matrix to Quaternion (CUDA)");
}
diff --git a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
index 9b2b56c..4b71f57 100644
--- a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
+++ b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
@@ -35,1003 +35,1172 @@
#define Y_ROT_NEG 10
#define Z_ROT_NEG 11
-#define MAX_BATCH_PER_BLOCK 32 // tunable parameter for improving occupancy
+#define MAX_BATCH_PER_BLOCK 32 // tunable parameter for improving occupancy
#define MAX_BW_BATCH_PER_BLOCK 16 // tunable parameter for improving occupancy
-#define MAX_TOTAL_LINKS \
- 750 // limited by shared memory size. We need to fit 16 * float32 per link
+#define MAX_TOTAL_LINKS \
+ 750 // limited by shared memory size. We need to fit 16 * float32 per
+ // link
-namespace Curobo {
-
-namespace Kinematics {
-
-template
-__device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
- float3 scale, psum_t &sum_out) {
- sum_out += scale.x * (a.y * b.z - a.z * b.y) +
- scale.y * (a.z * b.x - a.x * b.z) +
- scale.z * (a.x * b.y - a.y * b.x);
-}
-
-__device__ __forceinline__ void normalize_quaternion(float *q) {
- // get length:
- float length = 1.0 / norm4df(q[0], q[1], q[2], q[3]);
- if (q[0] < 0.0) {
- length = -1.0 * length;
- }
-
- q[1] = length * q[1];
- q[2] = length * q[2];
- q[3] = length * q[3];
- q[0] = length * q[0];
-}
-/**
- * @brief get quaternion from transformation matrix
- *
- * @param t transformation matrix 4x4
- * @param q quaternion in wxyz format
- */
-__device__ __forceinline__ void mat_to_quat(float *t, float *q) {
- float n;
- float n_sqrt;
- if (t[10] < 0.0) {
- if (t[0] > t[5]) {
- n = 1 + t[0] - t[5] - t[10];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = n * n_sqrt;
- q[2] = (t[1] + t[4]) * n_sqrt;
- q[3] = (t[8] + t[2]) * n_sqrt;
- q[0] = -1 * (t[6] - t[9]) * n_sqrt; // * -1 ; // this is the wrong one?
- } else {
- n = 1 - t[0] + t[5] - t[10];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[1] + t[4]) * n_sqrt;
- q[2] = n * n_sqrt;
- q[3] = (t[6] + t[9]) * n_sqrt;
- q[0] = -1 * (t[8] - t[2]) * n_sqrt;
+namespace Curobo
+{
+ namespace Kinematics
+ {
+ template
+ __device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
+ float3 scale, psum_t& sum_out)
+ {
+ sum_out += scale.x * (a.y * b.z - a.z * b.y) +
+ scale.y * (a.z * b.x - a.x * b.z) +
+ scale.z * (a.x * b.y - a.y * b.x);
}
- } else {
- if (t[0] < -1 * t[5]) {
- n = 1 - t[0] - t[5] + t[10];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[8] + t[2]) * n_sqrt;
- q[2] = (t[6] + t[9]) * n_sqrt;
- q[3] = n * n_sqrt;
- q[0] = -1 * (t[1] - t[4]) * n_sqrt;
- } else {
- n = 1 + t[0] + t[5] + t[10];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[6] - t[9]) * n_sqrt;
- q[2] = (t[8] - t[2]) * n_sqrt;
- q[3] = (t[1] - t[4]) * n_sqrt;
- q[0] = -1 * n * n_sqrt;
- }
- }
- normalize_quaternion(q);
-}
-/**
- * @brief get quaternion from transformation matrix
- *
- * @param t # rotation matrix 3x3
- * @param q quaternion in wxyz format
- */
-__device__ __forceinline__ void rot_to_quat(float *t, float *q) {
- float n;
- float n_sqrt;
- if (t[8] < 0.0) {
- if (t[0] > t[4]) {
- n = 1 + t[0] - t[4] - t[8];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = n * n_sqrt;
- q[2] = (t[1] + t[3]) * n_sqrt;
- q[3] = (t[6] + t[2]) * n_sqrt;
- q[0] = -1 * (t[5] - t[7]) * n_sqrt; // * -1 ; // this is the wrong one?
- } else {
- n = 1 - t[0] + t[4] - t[8];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[1] + t[3]) * n_sqrt;
- q[2] = n * n_sqrt;
- q[3] = (t[5] + t[7]) * n_sqrt;
- q[0] = -1 * (t[6] - t[2]) * n_sqrt;
- }
- } else {
- if (t[0] < -1 * t[4]) {
- n = 1 - t[0] - t[4] + t[8];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[6] + t[2]) * n_sqrt;
- q[2] = (t[5] + t[7]) * n_sqrt;
- q[3] = n * n_sqrt;
- q[0] = -1 * (t[1] - t[3]) * n_sqrt;
- } else {
- n = 1 + t[0] + t[4] + t[8];
- n_sqrt = 0.5 * rsqrtf(n);
- q[1] = (t[5] - t[7]) * n_sqrt;
- q[2] = (t[6] - t[2]) * n_sqrt;
- q[3] = (t[1] - t[3]) * n_sqrt;
- q[0] = -1 * n * n_sqrt;
- }
- }
- normalize_quaternion(q);
-}
+ __device__ __forceinline__ void normalize_quaternion(float *q)
+ {
+ // get length:
+ float length = 1.0 / norm4df(q[0], q[1], q[2], q[3]);
-__device__ __forceinline__ void rot_mul(float *r1, float *r2, float *r_out) {
- for (int i = 0; i < 9; i++) {
- r_out[i] = 0.0;
- }
+ if (q[0] < 0.0)
+ {
+ length = -1.0 * length;
+ }
+
+ q[1] = length * q[1];
+ q[2] = length * q[2];
+ q[3] = length * q[3];
+ q[0] = length * q[0];
+ }
+
+ /**
+ * @brief get quaternion from transformation matrix
+ *
+ * @param t transformation matrix 4x4
+ * @param q quaternion in wxyz format
+ */
+ __device__ __forceinline__ void mat_to_quat(float *t, float *q)
+ {
+ float n;
+ float n_sqrt;
+
+ if (t[10] < 0.0)
+ {
+ if (t[0] > t[5])
+ {
+ n = 1 + t[0] - t[5] - t[10];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = n * n_sqrt;
+ q[2] = (t[1] + t[4]) * n_sqrt;
+ q[3] = (t[8] + t[2]) * n_sqrt;
+ q[0] = -1 * (t[6] - t[9]) * n_sqrt; // * -1 ; // this is the wrong one?
+ }
+ else
+ {
+ n = 1 - t[0] + t[5] - t[10];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[1] + t[4]) * n_sqrt;
+ q[2] = n * n_sqrt;
+ q[3] = (t[6] + t[9]) * n_sqrt;
+ q[0] = -1 * (t[8] - t[2]) * n_sqrt;
+ }
+ }
+ else
+ {
+ if (t[0] < -1 * t[5])
+ {
+ n = 1 - t[0] - t[5] + t[10];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[8] + t[2]) * n_sqrt;
+ q[2] = (t[6] + t[9]) * n_sqrt;
+ q[3] = n * n_sqrt;
+ q[0] = -1 * (t[1] - t[4]) * n_sqrt;
+ }
+ else
+ {
+ n = 1 + t[0] + t[5] + t[10];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[6] - t[9]) * n_sqrt;
+ q[2] = (t[8] - t[2]) * n_sqrt;
+ q[3] = (t[1] - t[4]) * n_sqrt;
+ q[0] = -1 * n * n_sqrt;
+ }
+ }
+ normalize_quaternion(q);
+ }
+
+ /**
+ * @brief get quaternion from transformation matrix
+ *
+ * @param t # rotation matrix 3x3
+ * @param q quaternion in wxyz format
+ */
+ __device__ __forceinline__ void rot_to_quat(float *t, float *q)
+ {
+ float n;
+ float n_sqrt;
+
+ if (t[8] < 0.0)
+ {
+ if (t[0] > t[4])
+ {
+ n = 1 + t[0] - t[4] - t[8];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = n * n_sqrt;
+ q[2] = (t[1] + t[3]) * n_sqrt;
+ q[3] = (t[6] + t[2]) * n_sqrt;
+ q[0] = -1 * (t[5] - t[7]) * n_sqrt; // * -1 ; // this is the wrong one?
+ }
+ else
+ {
+ n = 1 - t[0] + t[4] - t[8];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[1] + t[3]) * n_sqrt;
+ q[2] = n * n_sqrt;
+ q[3] = (t[5] + t[7]) * n_sqrt;
+ q[0] = -1 * (t[6] - t[2]) * n_sqrt;
+ }
+ }
+ else
+ {
+ if (t[0] < -1 * t[4])
+ {
+ n = 1 - t[0] - t[4] + t[8];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[6] + t[2]) * n_sqrt;
+ q[2] = (t[5] + t[7]) * n_sqrt;
+ q[3] = n * n_sqrt;
+ q[0] = -1 * (t[1] - t[3]) * n_sqrt;
+ }
+ else
+ {
+ n = 1 + t[0] + t[4] + t[8];
+ n_sqrt = 0.5 * rsqrtf(n);
+ q[1] = (t[5] - t[7]) * n_sqrt;
+ q[2] = (t[6] - t[2]) * n_sqrt;
+ q[3] = (t[1] - t[3]) * n_sqrt;
+ q[0] = -1 * n * n_sqrt;
+ }
+ }
+ normalize_quaternion(q);
+ }
+
+ __device__ __forceinline__ void rot_mul(float *r1, float *r2, float *r_out)
+ {
+ for (int i = 0; i < 9; i++)
+ {
+ r_out[i] = 0.0;
+ }
#pragma unroll
- for (int k = 0; k < 3; k++) {
+
+ for (int k = 0; k < 3; k++)
+ {
#pragma unroll
- for (int j = 0; j < 3; j++) {
+
+ for (int j = 0; j < 3; j++)
+ {
#pragma unroll
- for (int i = 0; i < 3; i++) {
- r_out[i * 3 + j] += r1[i * 3 + k] * r2[k * 3 + j];
+
+ for (int i = 0; i < 3; i++)
+ {
+ r_out[i * 3 + j] += r1[i * 3 + k] * r2[k * 3 + j];
+ }
+ }
}
}
- }
-}
-__device__ __forceinline__ void rot_inverse_rot_mul(float *r1, float *r2,
- float *r_out) {
- // multiply two matrices:
- r_out[0] = r1[0] * r2[0] + r1[4] * r2[4] + r1[8] * r2[8];
- r_out[1] = r1[0] * r2[1] + r1[4] * r2[5] + r1[8] * r2[9];
- r_out[2] = r1[0] * r2[2] + r1[4] * r2[6] + r1[8] * r2[10];
- r_out[3] = r1[1] * r2[0] + r1[5] * r2[4] + r1[9] * r2[8];
- r_out[4] = r1[1] * r2[1] + r1[5] * r2[5] + r1[9] * r2[9];
- r_out[5] = r1[1] * r2[2] + r1[5] * r2[6] + r1[9] * r2[10];
+ __device__ __forceinline__ void rot_inverse_rot_mul(float *r1, float *r2,
+ float *r_out)
+ {
+ // multiply two matrices:
+ r_out[0] = r1[0] * r2[0] + r1[4] * r2[4] + r1[8] * r2[8];
+ r_out[1] = r1[0] * r2[1] + r1[4] * r2[5] + r1[8] * r2[9];
+ r_out[2] = r1[0] * r2[2] + r1[4] * r2[6] + r1[8] * r2[10];
- r_out[6] = r1[2] * r2[0] + r1[6] * r2[4] + r1[10] * r2[8];
- r_out[7] = r1[2] * r2[1] + r1[6] * r2[5] + r1[10] * r2[9];
- r_out[8] = r1[2] * r2[2] + r1[6] * r2[6] + r1[10] * r2[10];
-}
+ r_out[3] = r1[1] * r2[0] + r1[5] * r2[4] + r1[9] * r2[8];
+ r_out[4] = r1[1] * r2[1] + r1[5] * r2[5] + r1[9] * r2[9];
+ r_out[5] = r1[1] * r2[2] + r1[5] * r2[6] + r1[9] * r2[10];
+
+ r_out[6] = r1[2] * r2[0] + r1[6] * r2[4] + r1[10] * r2[8];
+ r_out[7] = r1[2] * r2[1] + r1[6] * r2[5] + r1[10] * r2[9];
+ r_out[8] = r1[2] * r2[2] + r1[6] * r2[6] + r1[10] * r2[10];
+ }
+
+ template
+ __device__ __forceinline__ void
+ transform_sphere(const float *transform_mat, const scalar_t *sphere, float *C)
+ {
+ float4 sphere_pos = *(float4 *)&sphere[0];
+ int st_idx = 0;
-template
-__device__ __forceinline__ void
-transform_sphere(const float *transform_mat, const scalar_t *sphere, float *C) {
- float4 sphere_pos = *(float4 *)&sphere[0];
- int st_idx = 0;
#pragma unroll 3
- for (int i = 0; i < 3; i++) {
- st_idx = i * 4;
- // do dot product:
- // C[i] = transform_mat[st_idx] * sphere_pos.x + transform_mat[st_idx+1] *
- // sphere_pos.y + transform_mat[st_idx+2] * sphere_pos.z +
- // transform_mat[st_idx + 3];
- float4 tm = *(float4 *)&transform_mat[st_idx];
- C[i] =
- tm.x * sphere_pos.x + tm.y * sphere_pos.y + tm.z * sphere_pos.z + tm.w;
- }
- C[3] = sphere_pos.w;
-}
-template
-__device__ __forceinline__ void fixed_joint_fn(const scalar_t *fixedTransform,
- float *JM) {
- JM[0] = fixedTransform[0];
- JM[1] = fixedTransform[M];
- JM[2] = fixedTransform[M * 2];
- JM[3] = fixedTransform[M * 3];
-}
+ for (int i = 0; i < 3; i++)
+ {
+ st_idx = i * 4;
-// prism_fn withOUT control flow
-template
-__device__ __forceinline__ void prism_fn(const scalar_t *fixedTransform,
- const float angle, const int col_idx,
- float *JM, const int xyz) {
- // int _and = (col_idx & (col_idx>>1)) & 0x1; // 1 for thread 3, 0 for all
- // other threads (0,1,2)
- //
- // float f1 = (1-_and) + _and * angle; // 1 for threads 0,1,2; angle for
- // thread 3 int addr_offset = (1-_and) * col_idx + _and * xyz; // col_idx
- // for threads 0,1,2; xyz for thread 3
- //
- // JM[0] = fixedTransform[0 + addr_offset] * f1 + _and *
- // fixedTransform[3];//FT_0[1]; JM[1] = fixedTransform[M + addr_offset]
- // * f1 + _and * fixedTransform[M + 3];//FT_1[1]; JM[2] = fixedTransform[M
- // + M + addr_offset] * f1 + _and * fixedTransform[M + M + 3];//FT_2[1];
- // JM[3] = fixedTransform[M + M + M + addr_offset] * (1-_and) + _and * 1; //
- // first three threads will get fixedTransform[3M+col_idx], the last thread
- // will get 1
-
- if (col_idx <= 2) {
- fixed_joint_fn(&fixedTransform[col_idx], &JM[0]);
- } else {
- JM[0] = fixedTransform[0 + xyz] * angle + fixedTransform[3]; // FT_0[1];
- JM[1] = fixedTransform[M + xyz] * angle + fixedTransform[M + 3]; // FT_1[1];
- JM[2] = fixedTransform[M + M + xyz] * angle +
- fixedTransform[M + M + 3]; // FT_2[1];
- JM[3] = 1;
- }
-}
-
-__device__ __forceinline__ void update_joint_type_direction(int &j_type, int8_t &axis_sign)
-{
- // Assume that input j_type >= 0 . Check fixed joint outside of this function.
-
- // Don't do anything if j_type < 6. j_type range is [0, 11]
- // divergence here.
- axis_sign = 1;
- if (j_type >= 6)
- {
- j_type -= 6;
- axis_sign = -1;
- }
-}
-
-
-__device__ __forceinline__ void update_joint_type_direction(int &j_type)
-{
- // Assume that input j_type >= 0 . Check fixed joint outside of this function.
-
- // Don't do anything if j_type < 6. j_type range is [0, 11]
- // divergence here.
- if (j_type >= 6)
- {
- j_type -= 6;
- }
-}
-
-__device__ __forceinline__ void update_axis_direction(
- float &angle,
- int &j_type)
-{
- // Assume that input j_type >= 0 . Check fixed joint outside of this function.
- // sign should be +ve <= 5 and -ve >5
- // j_type range is [0, 11].
- // cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 = +ve,
- // then this code should be j_type - 5.
- angle = -1 * copysignf(1.0, j_type - 6) * angle;
-
- update_joint_type_direction(j_type);
-
-}
-
-// In the following versions of rot_fn, some non-nan values may become nan as we
-// add multiple values instead of using if-else/switch-case.
-
-// version with no control flow
-template
-__device__ __forceinline__ void xrot_fn(const scalar_t *fixedTransform,
- const float angle, const int col_idx,
- float *JM) {
- // we found no change in convergence between fast approximate and IEEE sin,
- // cos functions using fast approximate method saves 5 registers per thread.
- float cos = __cosf(angle);
- float sin = __sinf(angle);
- float n_sin = -1 * sin;
-
- int bit1 = col_idx & 0x1;
- int bit2 = (col_idx & 0x2) >> 1;
- int _xor = bit1 ^ bit2; // 0 for threads 0 and 3, 1 for threads 1 and 2
- int col_idx_by_2 =
- col_idx / 2; // 0 for threads 0 and 1, 1 for threads 2 and 3
-
- float f1 = (1 - col_idx_by_2) * cos +
- col_idx_by_2 * n_sin; // thread 1 get cos , thread 2 gets n_sin
- float f2 = (1 - col_idx_by_2) * sin +
- col_idx_by_2 * cos; // thread 1 get sin, thread 2 gets cos
-
- f1 = _xor * f1 + (1 - _xor) * 1; // threads 1 and 2 will get f1; the other
- // two threads will get 1
- f2 = _xor *
- f2; // threads 1 and 2 will get f2, the other two threads will get 0.0
- float f3 = 1 - _xor;
-
- int addr_offset =
- _xor + (1 - _xor) *
- col_idx; // 1 for threads 1 and 2, col_idx for threads 0 and 3
-
- JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2];
- JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2];
- JM[2] =
- fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2];
- JM[3] = fixedTransform[M + M + M + addr_offset] *
- f3; // threads 1 and 2 get 0.0, remaining two get fixedTransform[3M];
-}
-
-// version with no control flow
-template
-__device__ __forceinline__ void yrot_fn(const scalar_t *fixedTransform,
- const float angle, const int col_idx,
- float *JM) {
- float cos = __cosf(angle);
- float sin = __sinf(angle);
- float n_sin = -1 * sin;
-
- int col_idx_per_2 =
- col_idx % 2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1.
- int col_idx_by_2 =
- col_idx / 2; // threads 0 and 1 will be 0 and threads 2 and 3 will be 1.
-
- float f1 = (1 - col_idx_by_2) * cos +
- col_idx_by_2 * sin; // thread 0 get cos , thread 2 gets sin
- float f2 = (1 - col_idx_by_2) * n_sin +
- col_idx_by_2 * cos; // thread 0 get n_sin, thread 2 gets cos
-
- f1 = (1 - col_idx_per_2) * f1 +
- col_idx_per_2 * 1; // threads 0 and 2 will get f1; the other two
- // threads will get 1
- f2 = (1 - col_idx_per_2) *
- f2; // threads 0 and 2 will get f2, the other two threads will get 0.0
- float f3 =
- col_idx_per_2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1.
-
- int addr_offset =
- col_idx_per_2 *
- col_idx; // threads 0 and 2 will get 0, the other two will get col_idx.
-
- JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2];
- JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2];
- JM[2] =
- fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2];
- JM[3] = fixedTransform[M + M + M + addr_offset] *
- f3; // threads 0 and 2 threads get 0.0, remaining two get
- // fixedTransform[3M];
-}
-
-// version with no control flow
-template
-__device__ __forceinline__ void zrot_fn(const scalar_t *fixedTransform,
- const float angle, const int col_idx,
- float *JM) {
- float cos = __cosf(angle);
- float sin = __sinf(angle);
- float n_sin = -1 * sin;
-
- int col_idx_by_2 =
- col_idx / 2; // first two threads will be 0 and the next two will be 1.
- int col_idx_per_2 =
- col_idx % 2; // first thread will be 0 and the second thread will be 1.
- float f1 = (1 - col_idx_per_2) * cos +
- col_idx_per_2 * n_sin; // thread 0 get cos , thread 1 gets n_sin
- float f2 = (1 - col_idx_per_2) * sin +
- col_idx_per_2 * cos; // thread 0 get sin, thread 1 gets cos
-
- f1 = (1 - col_idx_by_2) * f1 +
- col_idx_by_2 * 1; // first two threads get f1, other two threads get 1
- f2 = (1 - col_idx_by_2) *
- f2; // first two threads get f2, other two threads get 0.0
-
- int addr_offset =
- col_idx_by_2 *
- col_idx; // first 2 threads will get 0, the other two will get col_idx.
-
- JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[1];
- JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 1];
- JM[2] =
- fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 1];
- JM[3] = fixedTransform[M + M + M + addr_offset] *
- col_idx_by_2; // first two threads get 0.0, remaining two get
- // fixedTransform[3M];
-}
-
-template
-__device__ __forceinline__ void
-rot_backward_translation(const float3 &vec, float *cumul_mat, float *l_pos,
- const float3 &loc_grad, psum_t &grad_q, const int8_t axis_sign=1) {
-
- float3 e_pos, j_pos;
-
- e_pos.x = cumul_mat[3];
- e_pos.y = cumul_mat[4 + 3];
- e_pos.z = cumul_mat[4 + 4 + 3];
-
- // compute position gradient:
- j_pos = *(float3 *)&l_pos[0] - e_pos; // - e_pos;
- float3 scale_grad = axis_sign * loc_grad;
- scale_cross_sum(vec, j_pos, scale_grad, grad_q); // cross product
-}
-
-
-template
-__device__ __forceinline__ void
-rot_backward_rotation(const float3 vec, const float3 grad_vec, psum_t &grad_q, const int8_t axis_sign=1) {
- grad_q += axis_sign * dot(vec, grad_vec);
-}
-
-
-template
-__device__ __forceinline__ void
-prism_backward_translation(const float3 vec, const float3 grad_vec,
- psum_t &grad_q, const int8_t axis_sign=1) {
- grad_q += axis_sign * dot(vec, grad_vec);
-}
-template
-__device__ __forceinline__ void
-z_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position,
- float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) {
- float3 vec =
- make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]);
-
- // get rotation vector:
- rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
- loc_grad_position, grad_q, axis_sign);
-
- rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void
-x_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position,
- float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) {
- float3 vec =
- make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]);
-
- // get rotation vector:
- rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
- loc_grad_position, grad_q, axis_sign);
-
- rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
-}
-template
-__device__ __forceinline__ void
-y_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position,
- float3 &loc_grad_orientation, psum_t &grad_q, const int8_t axis_sign=1) {
- float3 vec =
- make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]);
-
- // get rotation vector:
- rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
- loc_grad_position, grad_q, axis_sign);
-
- rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void
-xyz_prism_backward_translation(float *cumul_mat, float3 &loc_grad,
- psum_t &grad_q, int xyz, const int8_t axis_sign=1) {
- prism_backward_translation(
- make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
- loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat,
- float3 &loc_grad,
- psum_t &grad_q,
- const int8_t axis_sign=1) {
- // get rotation vector:
- prism_backward_translation(
- make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat,
- float3 &loc_grad,
- psum_t &grad_q,
- const int8_t axis_sign=1) {
- // get rotation vector:
- prism_backward_translation(
- make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat,
- float3 &loc_grad,
- psum_t &grad_q,
- const int8_t axis_sign=1) {
- // get rotation vector:
- prism_backward_translation(
- make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign);
-}
-
-__device__ __forceinline__ void
-xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad,
- float &grad_q, int xyz, const int8_t axis_sign=1) {
- // get rotation vector:
- rot_backward_translation(
- make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
- &cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void
-x_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad,
- psum_t &grad_q, const int8_t axis_sign=1) {
- // get rotation vector:
- rot_backward_translation(
- make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), &cumul_mat[0],
- &l_pos[0], loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void
-y_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad,
- psum_t &grad_q, const int8_t axis_sign=1) {
- // get rotation vector:
- rot_backward_translation(
- make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), &cumul_mat[0],
- &l_pos[0], loc_grad, grad_q, axis_sign);
-}
-
-template
-__device__ __forceinline__ void
-z_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad,
- psum_t &grad_q, const int8_t axis_sign=1) {
- // get rotation vector:
- rot_backward_translation(
- make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), &cumul_mat[0],
- &l_pos[0], loc_grad, grad_q, axis_sign);
-}
-
-// An optimized version of kin_fused_warp_kernel.
-// This one should be about 10% faster.
-template
-__global__ void
-kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M
- scalar_t *link_quat, // batchSize x store_n_links x M x M
- scalar_t *b_robot_spheres, // batchSize x nspheres x M
- scalar_t *global_cumul_mat,
- const scalar_t *q, // batchSize x njoints
- const scalar_t *fixedTransform, // nlinks x M x M
- const scalar_t *robot_spheres, // nspheres x M
- const int8_t *jointMapType, // nlinks
- const int16_t *jointMap, // nlinks
- const int16_t *linkMap, // nlinks
- const int16_t *storeLinkMap, // store_n_links
- const int16_t *linkSphereMap, // nspheres
- const int batchSize, const int nspheres,
- const int nlinks, const int njoints,
- const int store_n_links) {
- extern __shared__ float cumul_mat[];
-
- int t = blockDim.x * blockIdx.x + threadIdx.x;
- const int batch = t / 4;
- if (batch >= batchSize)
- return;
-
- int col_idx = threadIdx.x % 4;
- const int local_batch = threadIdx.x / 4;
- const int matAddrBase = local_batch * nlinks * M * M;
- // read all fixed transforms to local cache:
-
- // copy base link transform:
- *(float4 *)&cumul_mat[matAddrBase + col_idx * M] =
- *(float4 *)&fixedTransform[col_idx * M];
-
- if (use_global_cumul) {
- *(float4 *)&global_cumul_mat[batch * nlinks * 16 + col_idx * M] =
- *(float4 *)&cumul_mat[matAddrBase + col_idx * M];
- }
- for (int8_t l = 1; l < nlinks; l++) //
- {
-
- // get one row of fixedTransform
- int ftAddrStart = l * M * M;
- int inAddrStart = matAddrBase + linkMap[l] * M * M;
- int outAddrStart = matAddrBase + l * M * M;
-
- // row index:
- // check joint type and use one of the helper functions:
- float JM[M];
- int j_type = jointMapType[l];
-
- if (j_type == FIXED)
- {
- fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]);
- }
- else
- {
- float angle = q[batch * njoints + jointMap[l]];
- update_axis_direction(angle, j_type);
- if (j_type <= Z_PRISM) {
- prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type);
- } else if (j_type == X_ROT) {
- xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else if (j_type == Y_ROT) {
- yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else if (j_type == Z_ROT) {
- zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else {
- assert(j_type >= FIXED && j_type <= Z_ROT);
+ // do dot product:
+ // C[i] = transform_mat[st_idx] * sphere_pos.x + transform_mat[st_idx+1] *
+ // sphere_pos.y + transform_mat[st_idx+2] * sphere_pos.z +
+ // transform_mat[st_idx + 3];
+ float4 tm = *(float4 *)&transform_mat[st_idx];
+ C[i] =
+ tm.x * sphere_pos.x + tm.y * sphere_pos.y + tm.z * sphere_pos.z + tm.w;
}
- }
+ C[3] = sphere_pos.w;
+ }
+
+ template
+ __device__ __forceinline__ void fixed_joint_fn(const scalar_t *fixedTransform,
+ float *JM)
+ {
+ JM[0] = fixedTransform[0];
+ JM[1] = fixedTransform[M];
+ JM[2] = fixedTransform[M * 2];
+ JM[3] = fixedTransform[M * 3];
+ }
+
+ // prism_fn withOUT control flow
+ template
+ __device__ __forceinline__ void prism_fn(const scalar_t *fixedTransform,
+ const float angle, const int col_idx,
+ float *JM, const int xyz)
+ {
+ // int _and = (col_idx & (col_idx>>1)) & 0x1; // 1 for thread 3, 0 for all
+ // other threads (0,1,2)
+ //
+ // float f1 = (1-_and) + _and * angle; // 1 for threads 0,1,2; angle for
+ // thread 3 int addr_offset = (1-_and) * col_idx + _and * xyz; // col_idx
+ // for threads 0,1,2; xyz for thread 3
+ //
+ // JM[0] = fixedTransform[0 + addr_offset] * f1 + _and *
+ // fixedTransform[3];//FT_0[1]; JM[1] = fixedTransform[M + addr_offset]
+ // * f1 + _and * fixedTransform[M + 3];//FT_1[1]; JM[2] = fixedTransform[M
+ // + M + addr_offset] * f1 + _and * fixedTransform[M + M + 3];//FT_2[1];
+ // JM[3] = fixedTransform[M + M + M + addr_offset] * (1-_and) + _and * 1; //
+ // first three threads will get fixedTransform[3M+col_idx], the last thread
+ // will get 1
+
+ if (col_idx <= 2)
+ {
+ fixed_joint_fn(&fixedTransform[col_idx], &JM[0]);
+ }
+ else
+ {
+ JM[0] = fixedTransform[0 + xyz] * angle + fixedTransform[3]; // FT_0[1];
+ JM[1] = fixedTransform[M + xyz] * angle + fixedTransform[M + 3]; // FT_1[1];
+ JM[2] = fixedTransform[M + M + xyz] * angle +
+ fixedTransform[M + M + 3]; // FT_2[1];
+ JM[3] = 1;
+ }
+ }
+
+ __device__ __forceinline__ void update_joint_type_direction(int& j_type, int8_t& axis_sign)
+ {
+ // Assume that input j_type >= 0 . Check fixed joint outside of this function.
+
+ // Don't do anything if j_type < 6. j_type range is [0, 11]
+ // divergence here.
+ axis_sign = 1;
+
+ if (j_type >= 6)
+ {
+ j_type -= 6;
+ axis_sign = -1;
+ }
+ }
+
+ __device__ __forceinline__ void update_joint_type_direction(int& j_type)
+ {
+ // Assume that input j_type >= 0 . Check fixed joint outside of this function.
+
+ // Don't do anything if j_type < 6. j_type range is [0, 11]
+ // divergence here.
+ if (j_type >= 6)
+ {
+ j_type -= 6;
+ }
+ }
+
+ __device__ __forceinline__ void update_axis_direction(
+ float& angle,
+ int & j_type)
+ {
+ // Assume that input j_type >= 0 . Check fixed joint outside of this function.
+ // sign should be +ve <= 5 and -ve >5
+ // j_type range is [0, 11].
+ // cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 =
+ // +ve,
+ // then this code should be j_type - 5.
+ angle = -1 * copysignf(1.0, j_type - 6) * angle;
+
+ update_joint_type_direction(j_type);
+ }
+
+ // In the following versions of rot_fn, some non-nan values may become nan as we
+ // add multiple values instead of using if-else/switch-case.
+
+ // version with no control flow
+ template
+ __device__ __forceinline__ void xrot_fn(const scalar_t *fixedTransform,
+ const float angle, const int col_idx,
+ float *JM)
+ {
+ // we found no change in convergence between fast approximate and IEEE sin,
+ // cos functions using fast approximate method saves 5 registers per thread.
+ float cos = __cosf(angle);
+ float sin = __sinf(angle);
+ float n_sin = -1 * sin;
+
+ int bit1 = col_idx & 0x1;
+ int bit2 = (col_idx & 0x2) >> 1;
+ int _xor = bit1 ^ bit2; // 0 for threads 0 and 3, 1 for threads 1 and 2
+ int col_idx_by_2 =
+ col_idx / 2; // 0 for threads 0 and 1, 1 for threads 2 and 3
+
+ float f1 = (1 - col_idx_by_2) * cos +
+ col_idx_by_2 * n_sin; // thread 1 get cos , thread 2 gets n_sin
+ float f2 = (1 - col_idx_by_2) * sin +
+ col_idx_by_2 * cos; // thread 1 get sin, thread 2 gets cos
+
+ f1 = _xor * f1 + (1 - _xor) * 1; // threads 1 and 2 will get f1; the other
+ // two threads will get 1
+ f2 = _xor *
+ f2; // threads 1 and 2 will get f2, the other two threads will
+ // get 0.0
+ float f3 = 1 - _xor;
+
+ int addr_offset =
+ _xor + (1 - _xor) *
+ col_idx; // 1 for threads 1 and 2, col_idx for threads 0 and 3
+
+ JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2];
+ JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2];
+ JM[2] =
+ fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2];
+ JM[3] = fixedTransform[M + M + M + addr_offset] *
+ f3; // threads 1 and 2 get 0.0, remaining two get fixedTransform[3M];
+ }
+
+ // version with no control flow
+ template
+ __device__ __forceinline__ void yrot_fn(const scalar_t *fixedTransform,
+ const float angle, const int col_idx,
+ float *JM)
+ {
+ float cos = __cosf(angle);
+ float sin = __sinf(angle);
+ float n_sin = -1 * sin;
+
+ int col_idx_per_2 =
+ col_idx % 2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1.
+ int col_idx_by_2 =
+ col_idx / 2; // threads 0 and 1 will be 0 and threads 2 and 3 will be 1.
+
+ float f1 = (1 - col_idx_by_2) * cos +
+ col_idx_by_2 * sin; // thread 0 get cos , thread 2 gets sin
+ float f2 = (1 - col_idx_by_2) * n_sin +
+ col_idx_by_2 * cos; // thread 0 get n_sin, thread 2 gets cos
+
+ f1 = (1 - col_idx_per_2) * f1 +
+ col_idx_per_2 * 1; // threads 0 and 2 will get f1; the other two
+ // threads will get 1
+ f2 = (1 - col_idx_per_2) *
+ f2; // threads 0 and 2 will get f2, the other two threads will get
+ // 0.0
+ float f3 =
+ col_idx_per_2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1.
+
+ int addr_offset =
+ col_idx_per_2 *
+ col_idx; // threads 0 and 2 will get 0, the other two will get col_idx.
+
+ JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2];
+ JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2];
+ JM[2] =
+ fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2];
+ JM[3] = fixedTransform[M + M + M + addr_offset] *
+ f3; // threads 0 and 2 threads get 0.0, remaining two get
+ // fixedTransform[3M];
+ }
+
+ // version with no control flow
+ template
+ __device__ __forceinline__ void zrot_fn(const scalar_t *fixedTransform,
+ const float angle, const int col_idx,
+ float *JM)
+ {
+ float cos = __cosf(angle);
+ float sin = __sinf(angle);
+ float n_sin = -1 * sin;
+
+ int col_idx_by_2 =
+ col_idx / 2; // first two threads will be 0 and the next two will be 1.
+ int col_idx_per_2 =
+ col_idx % 2; // first thread will be 0 and the second thread will be 1.
+ float f1 = (1 - col_idx_per_2) * cos +
+ col_idx_per_2 * n_sin; // thread 0 get cos , thread 1 gets n_sin
+ float f2 = (1 - col_idx_per_2) * sin +
+ col_idx_per_2 * cos; // thread 0 get sin, thread 1 gets cos
+
+ f1 = (1 - col_idx_by_2) * f1 +
+ col_idx_by_2 * 1; // first two threads get f1, other two threads get 1
+ f2 = (1 - col_idx_by_2) *
+ f2; // first two threads get f2, other two threads get 0.0
+
+ int addr_offset =
+ col_idx_by_2 *
+ col_idx; // first 2 threads will get 0, the other two will get col_idx.
+
+ JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[1];
+ JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 1];
+ JM[2] =
+ fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 1];
+ JM[3] = fixedTransform[M + M + M + addr_offset] *
+ col_idx_by_2; // first two threads get 0.0, remaining two get
+ // fixedTransform[3M];
+ }
+
+ template
+ __device__ __forceinline__ void
+ rot_backward_translation(const float3& vec, float *cumul_mat, float *l_pos,
+ const float3& loc_grad, psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ float3 e_pos, j_pos;
+
+ e_pos.x = cumul_mat[3];
+ e_pos.y = cumul_mat[4 + 3];
+ e_pos.z = cumul_mat[4 + 4 + 3];
+
+ // compute position gradient:
+ j_pos = *(float3 *)&l_pos[0] - e_pos; // - e_pos;
+ float3 scale_grad = axis_sign * loc_grad;
+ scale_cross_sum(vec, j_pos, scale_grad, grad_q); // cross product
+ }
+
+ template
+ __device__ __forceinline__ void
+ rot_backward_rotation(const float3 vec,
+ const float3 grad_vec,
+ psum_t & grad_q,
+ const int8_t axis_sign = 1)
+ {
+ grad_q += axis_sign * dot(vec, grad_vec);
+ }
+
+ template
+ __device__ __forceinline__ void
+ prism_backward_translation(const float3 vec, const float3 grad_vec,
+ psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ grad_q += axis_sign * dot(vec, grad_vec);
+ }
+
+ template
+ __device__ __forceinline__ void
+ z_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
+ float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ float3 vec =
+ make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]);
+
+ // get rotation vector:
+ rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
+ loc_grad_position, grad_q, axis_sign);
+
+ rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ x_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
+ float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ float3 vec =
+ make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]);
+
+ // get rotation vector:
+ rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
+ loc_grad_position, grad_q, axis_sign);
+
+ rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ y_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
+ float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ float3 vec =
+ make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]);
+
+ // get rotation vector:
+ rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0],
+ loc_grad_position, grad_q, axis_sign);
+
+ rot_backward_rotation(vec, loc_grad_orientation, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ xyz_prism_backward_translation(float *cumul_mat, float3& loc_grad,
+ psum_t& grad_q, int xyz, const int8_t axis_sign = 1)
+ {
+ prism_backward_translation(
+ make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
+ loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat,
+ float3 & loc_grad,
+ psum_t & grad_q,
+ const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ prism_backward_translation(
+ make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat,
+ float3 & loc_grad,
+ psum_t & grad_q,
+ const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ prism_backward_translation(
+ make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat,
+ float3 & loc_grad,
+ psum_t & grad_q,
+ const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ prism_backward_translation(
+ make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign);
+ }
+
+ __device__ __forceinline__ void
+ xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
+ float& grad_q, int xyz, const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ rot_backward_translation(
+ make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
+ &cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ x_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
+ psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ rot_backward_translation(
+ make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), &cumul_mat[0],
+ &l_pos[0], loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ y_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
+ psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ rot_backward_translation(
+ make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), &cumul_mat[0],
+ &l_pos[0], loc_grad, grad_q, axis_sign);
+ }
+
+ template
+ __device__ __forceinline__ void
+ z_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
+ psum_t& grad_q, const int8_t axis_sign = 1)
+ {
+ // get rotation vector:
+ rot_backward_translation(
+ make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), &cumul_mat[0],
+ &l_pos[0], loc_grad, grad_q, axis_sign);
+ }
+
+ // An optimized version of kin_fused_warp_kernel.
+ // This one should be about 10% faster.
+ template
+ __global__ void
+ kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M
+ scalar_t *link_quat, // batchSize x store_n_links x M x M
+ scalar_t *b_robot_spheres, // batchSize x nspheres x M
+ scalar_t *global_cumul_mat,
+ const scalar_t *q, // batchSize x njoints
+ const scalar_t *fixedTransform, // nlinks x M x M
+ const scalar_t *robot_spheres, // nspheres x M
+ const int8_t *jointMapType, // nlinks
+ const int16_t *jointMap, // nlinks
+ const int16_t *linkMap, // nlinks
+ const int16_t *storeLinkMap, // store_n_links
+ const int16_t *linkSphereMap, // nspheres
+ const int batchSize, const int nspheres,
+ const int nlinks, const int njoints,
+ const int store_n_links)
+ {
+ extern __shared__ float cumul_mat[];
+
+ int t = blockDim.x * blockIdx.x + threadIdx.x;
+ const int batch = t / 4;
+
+ if (batch >= batchSize)
+ return;
+
+ int col_idx = threadIdx.x % 4;
+ const int local_batch = threadIdx.x / 4;
+ const int matAddrBase = local_batch * nlinks * M * M;
+
+ // read all fixed transforms to local cache:
+
+ // copy base link transform:
+ *(float4 *)&cumul_mat[matAddrBase + col_idx * M] =
+ *(float4 *)&fixedTransform[col_idx * M];
+
+ if (use_global_cumul)
+ {
+ *(float4 *)&global_cumul_mat[batch * nlinks * 16 + col_idx * M] =
+ *(float4 *)&cumul_mat[matAddrBase + col_idx * M];
+ }
+
+ for (int8_t l = 1; l < nlinks; l++) //
+ {
+ // get one row of fixedTransform
+ int ftAddrStart = l * M * M;
+ int inAddrStart = matAddrBase + linkMap[l] * M * M;
+ int outAddrStart = matAddrBase + l * M * M;
+
+ // row index:
+ // check joint type and use one of the helper functions:
+ float JM[M];
+ int j_type = jointMapType[l];
+
+ if (j_type == FIXED)
+ {
+ fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]);
+ }
+ else
+ {
+ float angle = q[batch * njoints + jointMap[l]];
+ update_axis_direction(angle, j_type);
+
+ if (j_type <= Z_PRISM)
+ {
+ prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type);
+ }
+ else if (j_type == X_ROT)
+ {
+ xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else if (j_type == Y_ROT)
+ {
+ yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else if (j_type == Z_ROT)
+ {
+ zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else
+ {
+ assert(j_type >= FIXED && j_type <= Z_ROT);
+ }
+ }
#pragma unroll 4
- for (int i = 0; i < M; i++) {
- cumul_mat[outAddrStart + (i * M) + col_idx] =
- dot(*(float4 *)&cumul_mat[inAddrStart + (i * M)], *(float4 *)JM);
- }
- if (use_global_cumul) {
- *(float4 *)&global_cumul_mat[batch * nlinks * 16 + l * 16 + col_idx * M] =
- *(float4 *)&cumul_mat[outAddrStart + col_idx * M];
- }
- }
- // write out link:
-
- // do robot_spheres
-
- const int batchAddrs = batch * nspheres * 4;
- // read cumul mat index to run for this thread:
- int16_t read_cumul_idx = -1;
- int16_t spheres_perthread = (nspheres + 3) / 4;
- for (int16_t i = 0; i < spheres_perthread; i++) {
- //const int16_t sph_idx = col_idx * spheres_perthread + i;
- const int16_t sph_idx = col_idx + i * 4;
- // const int8_t sph_idx =
- // i * 4 + col_idx; // different order such that adjacent
- // spheres are in neighboring threads
- if (sph_idx >= nspheres) {
- break;
- }
- // read cumul idx:
- read_cumul_idx = linkSphereMap[sph_idx];
- float spheres_mem[4];
- const int16_t sphAddrs = sph_idx * 4;
-
- transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16],
- &robot_spheres[sphAddrs], &spheres_mem[0]);
- *(float4 *)&b_robot_spheres[batchAddrs + sphAddrs] =
- *(float4 *)&spheres_mem[0];
- }
-
- // write position and rotation, we convert rotation matrix to a quaternion and
- // write it out
- for (int16_t i = 0; i < store_n_links; i++) {
- int16_t l_map = storeLinkMap[i];
- int l_outAddrStart =
- (batch * store_n_links); // * 7) + i * 7;// + (t % M) * M;
- int outAddrStart = matAddrBase + l_map * M * M;
-
- float quat[4];
- // TODO: spread the work to different threads. For now all the threads will
- // do the same work.
- mat_to_quat(
- &cumul_mat[outAddrStart],
- &quat[0]); // get quaternion, all the 4 threads will do the same work
- link_quat[l_outAddrStart * 4 + i * 4 + col_idx] =
- quat[col_idx]; // one thread will write one element to memory
-
- if (col_idx < 3) {
- // threads 0,1,2 will execute the following store
- link_pos[l_outAddrStart * 3 + i * 3 + col_idx] =
- cumul_mat[outAddrStart + 3 + (col_idx)*4];
- }
- }
-}
-
-// kin_fused_backward_kernel3 uses 16 threads per batch, instead of 4 per batch
-// as in kin_fused_backward_kernel2.
-template
-__global__ void kin_fused_backward_kernel3(
- scalar_t *grad_out_link_q, // batchSize * njoints
- const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16
- const scalar_t *grad_nlinks_quat,
- const scalar_t *grad_spheres, // batchSize * nspheres * 4
- const scalar_t *global_cumul_mat,
- const scalar_t *q, // batchSize * njoints
- const scalar_t *fixedTransform, // nlinks * 16
- const scalar_t *robotSpheres, // batchSize * nspheres * 4
- const int8_t *jointMapType, // nlinks
- const int16_t *jointMap, // nlinks
- const int16_t *linkMap, // nlinks
- const int16_t *storeLinkMap, // store_n_links
- const int16_t *linkSphereMap, // nspheres
- const int16_t *linkChainMap, // nlinks*nlinks
- const int batchSize, const int nspheres, const int nlinks,
- const int njoints, const int store_n_links) {
- extern __shared__ float cumul_mat[];
-
- int t = blockDim.x * blockIdx.x + threadIdx.x;
- const int batch = t / 16;
- unsigned mask = __ballot_sync(0xffffffff, batch < batchSize);
-
- if (batch >= batchSize)
- return;
-
- // Each thread computes one element of the cumul_mat.
- // first 4 threads compute a row of the output;
- const int elem_idx = threadIdx.x % 16;
- const int col_idx = elem_idx % 4;
- const int local_batch = threadIdx.x / 16;
- const int matAddrBase = local_batch * nlinks * M * M;
-
- if (use_global_cumul) {
- for (int l = 0; l < nlinks; l++) {
- int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
- cumul_mat[outAddrStart + elem_idx] =
- global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx];
- }
- } else {
-
- cumul_mat[matAddrBase + elem_idx] = fixedTransform[elem_idx];
-
- for (int l = 1; l < nlinks; l++) // TODO: add base link transform
- {
- float JM[M]; // store one row locally for mat-mul
- int ftAddrStart = l * M * M; //+ (t % M) * M;
- int inAddrStart = matAddrBase + linkMap[l] * M * M;
- int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
-
- int j_type = jointMapType[l];
-
-
- if (j_type == FIXED) {
- fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]);
- }
- else {
- float angle = q[batch * njoints + jointMap[l]];
-
- update_axis_direction(angle, j_type);
-
- if (j_type <= Z_PRISM) {
- prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type);
- } else if (j_type == X_ROT) {
- xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else if (j_type == Y_ROT) {
- yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else if (j_type == Z_ROT) {
- zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
- } else {
- assert(j_type >= FIXED && j_type <= Z_ROT);
+ for (int i = 0; i < M; i++)
+ {
+ cumul_mat[outAddrStart + (i * M) + col_idx] =
+ dot(*(float4 *)&cumul_mat[inAddrStart + (i * M)], *(float4 *)JM);
+ }
+
+ if (use_global_cumul)
+ {
+ *(float4 *)&global_cumul_mat[batch * nlinks * 16 + l * 16 + col_idx * M] =
+ *(float4 *)&cumul_mat[outAddrStart + col_idx * M];
}
-
}
- // fetch one row of cumul_mat, multiply with a column, which is in JM
- cumul_mat[outAddrStart + elem_idx] =
- dot(*(float4 *)&cumul_mat[inAddrStart + ((elem_idx / 4) * M)],
- *(float4 *)JM);
- }
- }
- // thread-local partial sum accumulators
- // We would like to keep these partial sums in register file and avoid memory
- // accesses
- psum_t psum_grad[MAX_JOINTS]; // MAX_JOINTS
-// we are allocating a lot larger array. So, we will be initilizing just the
-// portion we need.
-#pragma unroll
- for (int i = 0; i < njoints; i++) {
- psum_grad[i] = 0.0;
- }
- // read cumul mat index to run for this thread:
- int read_cumul_idx = -1;
+ // write out link:
- const int spheres_perthread = (nspheres + 15) / 16;
+ // do robot_spheres
- for (int i = 0; i < spheres_perthread; i++) {
- //const int sph_idx = elem_idx * spheres_perthread + i;
- const int sph_idx = elem_idx + i * 16;
- if (sph_idx >= nspheres) {
- break;
- }
- const int sphAddrs = sph_idx * 4;
- const int batchAddrs = batch * nspheres * 4;
- float4 loc_grad_sphere_t = *(float4 *)&grad_spheres[batchAddrs + sphAddrs];
- // Sparsity-based optimization: Skip zero computation
- if (enable_sparsity_opt) {
- if (loc_grad_sphere_t.x == 0 && loc_grad_sphere_t.y == 0 &&
- loc_grad_sphere_t.z == 0) {
- continue;
- }
- }
- float3 loc_grad_sphere = make_float3(
- loc_grad_sphere_t.x, loc_grad_sphere_t.y, loc_grad_sphere_t.z);
+ const int batchAddrs = batch * nspheres * 4;
- // read cumul idx:
- read_cumul_idx = linkSphereMap[sph_idx];
- float spheres_mem[4];
- transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16],
- &robotSpheres[sphAddrs], &spheres_mem[0]);
- // assuming this sphere only depends on links lower than this index
- // This could be relaxed by making read_cumul_idx = number of links.
- //const int16_t loop_max = read_cumul_idx;
- const int16_t loop_max = nlinks - 1;
- for (int j = loop_max; j > -1; j--) {
- if (linkChainMap[read_cumul_idx * nlinks + j] == 0.0) {
- continue;
- }
- int8_t axis_sign = 1;
+ // read cumul mat index to run for this thread:
+ int16_t read_cumul_idx = -1;
+ int16_t spheres_perthread = (nspheres + 3) / 4;
- int j_type = jointMapType[j];
- if(j_type != FIXED)
+ for (int16_t i = 0; i < spheres_perthread; i++)
{
- update_joint_type_direction(j_type, axis_sign);
+ // const int16_t sph_idx = col_idx * spheres_perthread + i;
+ const int16_t sph_idx = col_idx + i * 4;
+
+ // const int8_t sph_idx =
+ // i * 4 + col_idx; // different order such that adjacent
+ // spheres are in neighboring threads
+ if (sph_idx >= nspheres)
+ {
+ break;
+ }
+
+ // read cumul idx:
+ read_cumul_idx = linkSphereMap[sph_idx];
+ float spheres_mem[4];
+ const int16_t sphAddrs = sph_idx * 4;
+
+ transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16],
+ &robot_spheres[sphAddrs], &spheres_mem[0]);
+ *(float4 *)&b_robot_spheres[batchAddrs + sphAddrs] =
+ *(float4 *)&spheres_mem[0];
}
- if (j_type == Z_ROT) {
- float result = 0.0;
- z_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
- &spheres_mem[0], loc_grad_sphere, result, axis_sign);
- psum_grad[jointMap[j]] += (psum_t)result;
- } else if (j_type >= X_PRISM && j_type <= Z_PRISM) {
- float result = 0.0;
- xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16],
- loc_grad_sphere, result, j_type, axis_sign);
- psum_grad[jointMap[j]] += (psum_t)result;
- } else if (j_type == X_ROT) {
- float result = 0.0;
- x_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
- &spheres_mem[0], loc_grad_sphere, result, axis_sign);
- psum_grad[jointMap[j]] += (psum_t)result;
- } else if (j_type == Y_ROT) {
- float result = 0.0;
- y_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
- &spheres_mem[0], loc_grad_sphere, result, axis_sign);
- psum_grad[jointMap[j]] += (psum_t)result;
+
+ // write position and rotation, we convert rotation matrix to a quaternion and
+ // write it out
+ for (int16_t i = 0; i < store_n_links; i++)
+ {
+ int16_t l_map = storeLinkMap[i];
+ int l_outAddrStart =
+ (batch * store_n_links); // * 7) + i * 7;// + (t % M) * M;
+ int outAddrStart = matAddrBase + l_map * M * M;
+
+ float quat[4];
+
+ // TODO: spread the work to different threads. For now all the threads will
+ // do the same work.
+ mat_to_quat(
+ &cumul_mat[outAddrStart],
+ &quat[0]); // get quaternion, all the 4 threads will do the same work
+ link_quat[l_outAddrStart * 4 + i * 4 + col_idx] =
+ quat[col_idx]; // one thread will write one element to memory
+
+ if (col_idx < 3)
+ {
+ // threads 0,1,2 will execute the following store
+ link_pos[l_outAddrStart * 3 + i * 3 + col_idx] =
+ cumul_mat[outAddrStart + 3 + (col_idx) * 4];
+ }
}
}
- }
-
- // Instead of accumulating the sphere_grad and link_grad separately, we will
- // accumulate them together once below.
- //
- // // accumulate across 4 threads using shuffle operation
- // for(int j=0; j= 0; k--)
- for (int16_t k=0; k < joints_per_thread; k++)
+ // kin_fused_backward_kernel3 uses 16 threads per batch, instead of 4 per batch
+ // as in kin_fused_backward_kernel2.
+ template
+ __global__ void kin_fused_backward_kernel3(
+ scalar_t *grad_out_link_q, // batchSize * njoints
+ const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16
+ const scalar_t *grad_nlinks_quat,
+ const scalar_t *grad_spheres, // batchSize * nspheres * 4
+ const scalar_t *global_cumul_mat,
+ const scalar_t *q, // batchSize * njoints
+ const scalar_t *fixedTransform, // nlinks * 16
+ const scalar_t *robotSpheres, // batchSize * nspheres * 4
+ const int8_t *jointMapType, // nlinks
+ const int16_t *jointMap, // nlinks
+ const int16_t *linkMap, // nlinks
+ const int16_t *storeLinkMap, // store_n_links
+ const int16_t *linkSphereMap, // nspheres
+ const int16_t *linkChainMap, // nlinks*nlinks
+ const int batchSize, const int nspheres, const int nlinks,
+ const int njoints, const int store_n_links)
{
-
- //int16_t j = elem_idx * joints_per_thread + k;
- int16_t j = elem_idx + k * 16;
- //int16_t j = k * M + elem_idx;
- if (j > max_lmap || j < 0)
- continue;
- // This can be spread across threads as they are not sequential?
- if (linkChainMap[l_map * nlinks + j] == 0.0) {
- continue;
- }
- int16_t j_idx = jointMap[j];
- int j_type = jointMapType[j];
+ extern __shared__ float cumul_mat[];
- int8_t axis_sign = 1;
- if (j_type != FIXED)
+ int t = blockDim.x * blockIdx.x + threadIdx.x;
+ const int batch = t / 16;
+ unsigned mask = __ballot_sync(0xffffffff, batch < batchSize);
+
+ if (batch >= batchSize)
+ return;
+
+ // Each thread computes one element of the cumul_mat.
+ // first 4 threads compute a row of the output;
+ const int elem_idx = threadIdx.x % 16;
+ const int col_idx = elem_idx % 4;
+ const int local_batch = threadIdx.x / 16;
+ const int matAddrBase = local_batch * nlinks * M * M;
+
+ if (use_global_cumul)
{
- update_joint_type_direction(j_type, axis_sign);
+ for (int l = 0; l < nlinks; l++)
+ {
+ int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
+ cumul_mat[outAddrStart + elem_idx] =
+ global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx];
+ }
}
-
- // get rotation vector:
- if (j_type == Z_ROT) {
- z_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0],
- g_position, g_orientation, psum_grad[j_idx], axis_sign);
- } else if (j_type >= X_PRISM & j_type <= Z_PRISM) {
- xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16],
- g_position, psum_grad[j_idx], j_type, axis_sign);
- } else if (j_type == X_ROT) {
- x_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0],
- g_position, g_orientation, psum_grad[j_idx], axis_sign);
- } else if (j_type == Y_ROT) {
- y_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0],
- g_position, g_orientation, psum_grad[j_idx], axis_sign);
- }
- }
- }
- if (PARALLEL_WRITE) {
-
-// accumulate the partial sums across the 16 threads
-#pragma unroll
- for (int16_t j = 0; j < njoints; j++) {
- psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 1);
- psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 2);
- psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 4);
- psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 8);
- // thread 0: psum_grad[j] will have the sum across 16 threads
- // write out using only thread 0
- }
-
- const int16_t joints_per_thread = (njoints + 15) / 16;
-
-#pragma unroll
- for (int16_t j = 0; j < joints_per_thread; j++) {
- //const int16_t j_idx = elem_idx * joints_per_thread + j;
- const int16_t j_idx = elem_idx + j * 16;
- if (j_idx >= njoints) {
- break;
- }
- grad_out_link_q[batch * njoints + j_idx] =
- psum_grad[j_idx]; // write the sum to memory
- }
- } else {
-#pragma unroll
- for (int16_t j = 0; j < njoints; j++) {
- psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 1);
- psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 2);
- psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 4);
- psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 8);
- // thread 0: psum_grad[j] will have the sum across 16 threads
- // write out using only thread 0
- }
-
- if (elem_idx > 0) {
- return;
- }
-
-#pragma unroll
- for (int16_t j = 0; j < njoints; j++) {
+ else
{
- grad_out_link_q[batch * njoints + j] =
- psum_grad[j]; // write the sum to memory
+ cumul_mat[matAddrBase + elem_idx] = fixedTransform[elem_idx];
+
+ for (int l = 1; l < nlinks; l++) // TODO: add base link transform
+ {
+ float JM[M]; // store one row locally for mat-mul
+ int ftAddrStart = l * M * M; // + (t % M) * M;
+ int inAddrStart = matAddrBase + linkMap[l] * M * M;
+ int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
+
+ int j_type = jointMapType[l];
+
+
+ if (j_type == FIXED)
+ {
+ fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]);
+ }
+ else
+ {
+ float angle = q[batch * njoints + jointMap[l]];
+
+ update_axis_direction(angle, j_type);
+
+ if (j_type <= Z_PRISM)
+ {
+ prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type);
+ }
+ else if (j_type == X_ROT)
+ {
+ xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else if (j_type == Y_ROT)
+ {
+ yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else if (j_type == Z_ROT)
+ {
+ zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]);
+ }
+ else
+ {
+ assert(j_type >= FIXED && j_type <= Z_ROT);
+ }
+ }
+
+ // fetch one row of cumul_mat, multiply with a column, which is in JM
+ cumul_mat[outAddrStart + elem_idx] =
+ dot(*(float4 *)&cumul_mat[inAddrStart + ((elem_idx / 4) * M)],
+ *(float4 *)JM);
+ }
}
+
+ // thread-local partial sum accumulators
+ // We would like to keep these partial sums in register file and avoid memory
+ // accesses
+ psum_t psum_grad[MAX_JOINTS]; // MAX_JOINTS
+ // we are allocating a lot larger array. So, we will be initilizing just the
+ // portion we need.
+#pragma unroll
+
+ for (int i = 0; i < njoints; i++)
+ {
+ psum_grad[i] = 0.0;
+ }
+
+ // read cumul mat index to run for this thread:
+ int read_cumul_idx = -1;
+
+ const int spheres_perthread = (nspheres + 15) / 16;
+
+ for (int i = 0; i < spheres_perthread; i++)
+ {
+ // const int sph_idx = elem_idx * spheres_perthread + i;
+ const int sph_idx = elem_idx + i * 16;
+
+ if (sph_idx >= nspheres)
+ {
+ break;
+ }
+ const int sphAddrs = sph_idx * 4;
+ const int batchAddrs = batch * nspheres * 4;
+ float4 loc_grad_sphere_t = *(float4 *)&grad_spheres[batchAddrs + sphAddrs];
+
+ // Sparsity-based optimization: Skip zero computation
+ if (enable_sparsity_opt)
+ {
+ if ((loc_grad_sphere_t.x == 0) && (loc_grad_sphere_t.y == 0) &&
+ (loc_grad_sphere_t.z == 0))
+ {
+ continue;
+ }
+ }
+ float3 loc_grad_sphere = make_float3(
+ loc_grad_sphere_t.x, loc_grad_sphere_t.y, loc_grad_sphere_t.z);
+
+ // read cumul idx:
+ read_cumul_idx = linkSphereMap[sph_idx];
+ float spheres_mem[4];
+ transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16],
+ &robotSpheres[sphAddrs], &spheres_mem[0]);
+
+ // assuming this sphere only depends on links lower than this index
+ // This could be relaxed by making read_cumul_idx = number of links.
+ // const int16_t loop_max = read_cumul_idx;
+ const int16_t loop_max = nlinks - 1;
+
+ for (int j = loop_max; j > -1; j--)
+ {
+ if (linkChainMap[read_cumul_idx * nlinks + j] == 0.0)
+ {
+ continue;
+ }
+ int8_t axis_sign = 1;
+
+ int j_type = jointMapType[j];
+
+ if (j_type != FIXED)
+ {
+ update_joint_type_direction(j_type, axis_sign);
+ }
+
+ if (j_type == Z_ROT)
+ {
+ float result = 0.0;
+ z_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
+ &spheres_mem[0], loc_grad_sphere, result, axis_sign);
+ psum_grad[jointMap[j]] += (psum_t)result;
+ }
+ else if ((j_type >= X_PRISM) && (j_type <= Z_PRISM))
+ {
+ float result = 0.0;
+ xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16],
+ loc_grad_sphere, result, j_type, axis_sign);
+ psum_grad[jointMap[j]] += (psum_t)result;
+ }
+ else if (j_type == X_ROT)
+ {
+ float result = 0.0;
+ x_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
+ &spheres_mem[0], loc_grad_sphere, result, axis_sign);
+ psum_grad[jointMap[j]] += (psum_t)result;
+ }
+ else if (j_type == Y_ROT)
+ {
+ float result = 0.0;
+ y_rot_backward_translation(&cumul_mat[matAddrBase + j * 16],
+ &spheres_mem[0], loc_grad_sphere, result, axis_sign);
+ psum_grad[jointMap[j]] += (psum_t)result;
+ }
+ }
+ }
+
+
+ // Instead of accumulating the sphere_grad and link_grad separately, we will
+ // accumulate them together once below.
+ //
+ // // accumulate across 4 threads using shuffle operation
+ // for(int j=0; j= 0; k--)
+ for (int16_t k = 0; k < joints_per_thread; k++)
+ {
+ // int16_t j = elem_idx * joints_per_thread + k;
+ int16_t j = elem_idx + k * 16;
+
+ // int16_t j = k * M + elem_idx;
+ if ((j > max_lmap) || (j < 0))
+ continue;
+
+ // This can be spread across threads as they are not sequential?
+ if (linkChainMap[l_map * nlinks + j] == 0.0)
+ {
+ continue;
+ }
+ int16_t j_idx = jointMap[j];
+ int j_type = jointMapType[j];
+
+ int8_t axis_sign = 1;
+
+ if (j_type != FIXED)
+ {
+ update_joint_type_direction(j_type, axis_sign);
+ }
+
+ // get rotation vector:
+ if (j_type == Z_ROT)
+ {
+ z_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0],
+ g_position, g_orientation, psum_grad[j_idx], axis_sign);
+ }
+ else if (j_type >= X_PRISM & j_type <= Z_PRISM)
+ {
+ xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16],
+ g_position, psum_grad[j_idx], j_type, axis_sign);
+ }
+ else if (j_type == X_ROT)
+ {
+ x_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0],
+ g_position, g_orientation, psum_grad[j_idx], axis_sign);
+ }
+ else if (j_type == Y_ROT)
+ {
+ y_rot_backward(&cumul_mat[matAddrBase + (j) * M * M], &l_pos[0],
+ g_position, g_orientation, psum_grad[j_idx], axis_sign);
+ }
+ }
+ }
+
+ if (PARALLEL_WRITE)
+ {
+ // accumulate the partial sums across the 16 threads
+#pragma unroll
+
+ for (int16_t j = 0; j < njoints; j++)
+ {
+ psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 1);
+ psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 2);
+ psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 4);
+ psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 8);
+
+ // thread 0: psum_grad[j] will have the sum across 16 threads
+ // write out using only thread 0
+ }
+
+ const int16_t joints_per_thread = (njoints + 15) / 16;
+
+#pragma unroll
+
+ for (int16_t j = 0; j < joints_per_thread; j++)
+ {
+ // const int16_t j_idx = elem_idx * joints_per_thread + j;
+ const int16_t j_idx = elem_idx + j * 16;
+
+ if (j_idx >= njoints)
+ {
+ break;
+ }
+ grad_out_link_q[batch * njoints + j_idx] =
+ psum_grad[j_idx]; // write the sum to memory
+ }
+ }
+ else
+ {
+#pragma unroll
+
+ for (int16_t j = 0; j < njoints; j++)
+ {
+ psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 1);
+ psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 2);
+ psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 4);
+ psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 8);
+
+ // thread 0: psum_grad[j] will have the sum across 16 threads
+ // write out using only thread 0
+ }
+
+ if (elem_idx > 0)
+ {
+ return;
+ }
+
+#pragma unroll
+
+ for (int16_t j = 0; j < njoints; j++)
+ {
+ {
+ grad_out_link_q[batch * njoints + j] =
+ psum_grad[j]; // write the sum to memory
+ }
+ }
+ }
+
+ // accumulate the partial sums across the 16 threads
}
- }
- // accumulate the partial sums across the 16 threads
-}
-template
-__global__ void mat_to_quat_kernel(scalar_t *out_quat,
- const scalar_t *in_rot_mat,
- const int batch_size) {
- // Only works for float32
- const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
- if (batch_idx >= batch_size) {
- return;
- }
- float q[4] = {0.0}; // initialize array
+ template
+ __global__ void mat_to_quat_kernel(scalar_t *out_quat,
+ const scalar_t *in_rot_mat,
+ const int batch_size)
+ {
+ // Only works for float32
+ const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
- float rot[9];
+ if (batch_idx >= batch_size)
+ {
+ return;
+ }
+ float q[4] = { 0.0 }; // initialize array
- // read rot
- *(float3 *)&rot[0] = *(float3 *)&in_rot_mat[batch_idx * 9];
- *(float3 *)&rot[3] = *(float3 *)&in_rot_mat[batch_idx * 9 + 3];
- *(float3 *)&rot[6] = *(float3 *)&in_rot_mat[batch_idx * 9 + 6];
+ float rot[9];
- rot_to_quat(&rot[0], &q[0]);
+ // read rot
+ *(float3 *)&rot[0] = *(float3 *)&in_rot_mat[batch_idx * 9];
+ *(float3 *)&rot[3] = *(float3 *)&in_rot_mat[batch_idx * 9 + 3];
+ *(float3 *)&rot[6] = *(float3 *)&in_rot_mat[batch_idx * 9 + 6];
- // write quaternion:
- *(float4 *)&out_quat[batch_idx * 4] = *(float4 *)&q[0];
-}
-} // namespace Kinematics
-} // namespace Curobo
+ rot_to_quat(&rot[0], &q[0]);
-std::vector kin_fused_forward(
- torch::Tensor link_pos, torch::Tensor link_quat,
- torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
- const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
- const torch::Tensor robot_spheres, const torch::Tensor link_map,
- const torch::Tensor joint_map, const torch::Tensor joint_map_type,
- const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
- const int batch_size, const int n_spheres,
- const bool use_global_cumul = false) {
+ // write quaternion:
+ *(float4 *)&out_quat[batch_idx * 4] = *(float4 *)&q[0];
+ }
+ } // namespace Kinematics
+} // namespace Curobo
+
+std::vectorkin_fused_forward(
+ torch::Tensor link_pos, torch::Tensor link_quat,
+ torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+ const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+ const torch::Tensor robot_spheres, const torch::Tensor link_map,
+ const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+ const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+ const int batch_size, const int n_spheres,
+ const bool use_global_cumul = false)
+{
using namespace Curobo::Kinematics;
- const int n_joints = joint_vec.size(0) / batch_size;
- const int n_links = link_map.size(0);
+ const int n_joints = joint_vec.size(0) / batch_size;
+ const int n_links = link_map.size(0);
const int store_n_links = link_pos.size(1);
assert(joint_map.dtype() == torch::kInt16);
assert(joint_map_type.dtype() == torch::kInt8);
@@ -1043,63 +1212,70 @@ std::vector kin_fused_forward(
int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links));
- if (batches_per_block == 0) {
+ if (batches_per_block == 0)
+ {
batches_per_block = 1;
}
- if (batches_per_block > MAX_BATCH_PER_BLOCK) {
+ if (batches_per_block > MAX_BATCH_PER_BLOCK)
+ {
batches_per_block = MAX_BATCH_PER_BLOCK;
}
- if (batches_per_block * M > 1024) {
+ if (batches_per_block * M > 1024)
+ {
batches_per_block = 1024 / (M);
}
+
// batches_per_block = 1;
const int threadsPerBlock = batches_per_block * M;
- const int blocksPerGrid =
- (batch_size * M + threadsPerBlock - 1) / threadsPerBlock;
+ const int blocksPerGrid =
+ (batch_size * M + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMemSize = batches_per_block * n_links * M * M * sizeof(float);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- if (use_global_cumul) {
+ if (use_global_cumul)
+ {
AT_DISPATCH_FLOATING_TYPES(
- link_pos.scalar_type(), "kin_fused_forward", ([&] {
- kin_fused_warp_kernel2
- <<>>(
- link_pos.data_ptr(), link_quat.data_ptr(),
- batch_robot_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
- } else {
+ link_pos.scalar_type(), "kin_fused_forward", ([&] {
+ kin_fused_warp_kernel2
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ link_pos.data_ptr(), link_quat.data_ptr(),
+ batch_robot_spheres.data_ptr(),
+ global_cumul_mat.data_ptr(),
+ joint_vec.data_ptr(),
+ fixed_transform.data_ptr(),
+ robot_spheres.data_ptr(),
+ joint_map_type.data_ptr(),
+ joint_map.data_ptr(), link_map.data_ptr(),
+ store_link_map.data_ptr(),
+ link_sphere_map.data_ptr(), batch_size, n_spheres,
+ n_links, n_joints, store_n_links);
+ }));
+ }
+ else
+ {
AT_DISPATCH_FLOATING_TYPES(
- link_pos.scalar_type(), "kin_fused_forward", ([&] {
- kin_fused_warp_kernel2
- <<>>(
- link_pos.data_ptr(), link_quat.data_ptr(),
- batch_robot_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
+ link_pos.scalar_type(), "kin_fused_forward", ([&] {
+ kin_fused_warp_kernel2
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ link_pos.data_ptr(), link_quat.data_ptr(),
+ batch_robot_spheres.data_ptr(),
+ global_cumul_mat.data_ptr(),
+ joint_vec.data_ptr(),
+ fixed_transform.data_ptr(),
+ robot_spheres.data_ptr(),
+ joint_map_type.data_ptr(),
+ joint_map.data_ptr(), link_map.data_ptr(),
+ store_link_map.data_ptr(),
+ link_sphere_map.data_ptr(), batch_size, n_spheres,
+ n_links, n_joints, store_n_links);
+ }));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
- return {link_pos, link_quat, batch_robot_spheres, global_cumul_mat};
+ return { link_pos, link_quat, batch_robot_spheres, global_cumul_mat };
}
/////////////////////////////////////////////
@@ -1108,48 +1284,55 @@ std::vector kin_fused_forward(
// This version is 30-100% faster compared to
// kin_fused_backward_4t.
/////////////////////////////////////////////
-std::vector kin_fused_backward_16t(
- torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
- const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
- const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
- const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
- const torch::Tensor link_map, const torch::Tensor joint_map,
- const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
- const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
- const int batch_size, const int n_spheres, const bool sparsity_opt = true,
- const bool use_global_cumul = false) {
+std::vectorkin_fused_backward_16t(
+ torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+ const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+ const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+ const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+ const torch::Tensor link_map, const torch::Tensor joint_map,
+ const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+ const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+ const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+ const bool use_global_cumul = false)
+{
using namespace Curobo::Kinematics;
- const int n_joints = joint_vec.size(0) / batch_size;
- const int n_links = link_map.size(0);
+ const int n_joints = joint_vec.size(0) / batch_size;
+ const int n_links = link_map.size(0);
const int store_n_links = store_link_map.size(0);
// assert(n_links < 128);
assert(n_joints < 128); // for larger num. of joints, change kernel3's
// MAX_JOINTS template value.
assert(n_links < MAX_TOTAL_LINKS);
+
// We need 16 threads per batch
// Find the maximum number of batches we can use per block:
//
int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links));
- if (batches_per_block == 0) {
+
+ if (batches_per_block == 0)
+ {
batches_per_block = 1;
}
// To optimize for better occupancy, we might limit to MAX_BATCH_PER_BLOCK
- if (batches_per_block > MAX_BW_BATCH_PER_BLOCK) {
+ if (batches_per_block > MAX_BW_BATCH_PER_BLOCK)
+ {
batches_per_block = MAX_BW_BATCH_PER_BLOCK;
}
// we cannot have more than 1024 threads:
- if (batches_per_block * M * M > 1024) {
+ if (batches_per_block * M * M > 1024)
+ {
batches_per_block = 1024 / (M * M);
}
const int threadsPerBlock = batches_per_block * M * M;
const int blocksPerGrid =
- (batch_size * M * M + threadsPerBlock - 1) / threadsPerBlock;
+ (batch_size * M * M + threadsPerBlock - 1) / threadsPerBlock;
+
// assert to make sure n_joints, n_links < 128 to avoid overflow
// printf("threadsPerBlock: %d, blocksPerGRid: %d\n", threadsPerBlock,
// blocksPerGrid);
@@ -1158,103 +1341,111 @@ std::vector kin_fused_backward_16t(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
assert(sparsity_opt);
- if (use_global_cumul) {
- if (n_joints < 16) {
- AT_DISPATCH_FLOATING_TYPES(
- grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
- kin_fused_backward_kernel3
- <<>>(
- grad_out.data_ptr(),
- grad_nlinks_pos.data_ptr(),
- grad_nlinks_quat.data_ptr(),
- grad_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(),
- link_chain_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
- } else if (n_joints < 64) {
+ if (use_global_cumul)
+ {
+ if (n_joints < 16)
+ {
AT_DISPATCH_FLOATING_TYPES(
- grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
- kin_fused_backward_kernel3
- <<>>(
- grad_out.data_ptr(),
- grad_nlinks_pos.data_ptr(),
- grad_nlinks_quat.data_ptr(),
- grad_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(),
- link_chain_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
- } else {
- AT_DISPATCH_FLOATING_TYPES(
- grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
- kin_fused_backward_kernel3
- <<>>(
- grad_out.data_ptr(),
- grad_nlinks_pos.data_ptr(),
- grad_nlinks_quat.data_ptr(),
- grad_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(),
- link_chain_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
+ grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
+ kin_fused_backward_kernel3
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ grad_out.data_ptr(),
+ grad_nlinks_pos.data_ptr(),
+ grad_nlinks_quat.data_ptr(),
+ grad_spheres.data_ptr(),
+ global_cumul_mat.data_ptr(),
+ joint_vec.data_ptr(),
+ fixed_transform.data_ptr(),
+ robot_spheres.data_ptr(),
+ joint_map_type.data_ptr(),
+ joint_map.data_ptr(), link_map.data_ptr(),
+ store_link_map.data_ptr(),
+ link_sphere_map.data_ptr(),
+ link_chain_map.data_ptr(), batch_size, n_spheres,
+ n_links, n_joints, store_n_links);
+ }));
+ }
+ else if (n_joints < 64)
+ {
+ AT_DISPATCH_FLOATING_TYPES(
+ grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
+ kin_fused_backward_kernel3
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ grad_out.data_ptr(),
+ grad_nlinks_pos.data_ptr(),
+ grad_nlinks_quat.data_ptr(),
+ grad_spheres.data_ptr(),
+ global_cumul_mat.data_ptr(),
+ joint_vec.data_ptr(),
+ fixed_transform.data_ptr(),
+ robot_spheres.data_ptr(),
+ joint_map_type.data_ptr(),
+ joint_map.data_ptr(), link_map.data_ptr(),
+ store_link_map.data_ptr(),
+ link_sphere_map.data_ptr(),
+ link_chain_map.data_ptr(), batch_size, n_spheres,
+ n_links, n_joints, store_n_links);
+ }));
+ }
+ else
+ {
+ AT_DISPATCH_FLOATING_TYPES(
+ grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
+ kin_fused_backward_kernel3
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ grad_out.data_ptr(),
+ grad_nlinks_pos.data_ptr(),
+ grad_nlinks_quat.data_ptr(),
+ grad_spheres.data_ptr(),
+ global_cumul_mat.data_ptr(),
+ joint_vec.data_ptr(),
+ fixed_transform.data_ptr(),
+ robot_spheres.data_ptr(),
+ joint_map_type.data_ptr(),
+ joint_map.data_ptr(), link_map.data_ptr(),
+ store_link_map.data_ptr(),
+ link_sphere_map.data_ptr(),
+ link_chain_map.data_ptr(), batch_size, n_spheres,
+ n_links, n_joints, store_n_links);
+ }));
}
- //
- } else {
+ //
+ }
+ else
+ {
//
AT_DISPATCH_FLOATING_TYPES(
- grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
- kin_fused_backward_kernel3
- <<>>(
- grad_out.data_ptr(),
- grad_nlinks_pos.data_ptr(),
- grad_nlinks_quat.data_ptr(),
- grad_spheres.data_ptr(),
- global_cumul_mat.data_ptr(),
- joint_vec.data_ptr(),
- fixed_transform.data_ptr(),
- robot_spheres.data_ptr(),
- joint_map_type.data_ptr(),
- joint_map.data_ptr(), link_map.data_ptr(),
- store_link_map.data_ptr(),
- link_sphere_map.data_ptr(),
- link_chain_map.data_ptr(), batch_size, n_spheres,
- n_links, n_joints, store_n_links);
- }));
+ grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
+ kin_fused_backward_kernel3
+ << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
+ grad_out.data_ptr(),
+ grad_nlinks_pos.data_ptr