release repository

11
src/curobo/curobolib/__init__.py
Normal file
@@ -0,0 +1,11 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
"""CuRoboLib Module."""
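For orientation, a minimal sketch (not part of this commit) of how the compiled extensions added below could be built and imported from Python with torch.utils.cpp_extension.load; the module name and the kernel source file name are illustrative assumptions, not taken from this diff.

# Hypothetical build/import sketch; the .cu source name below is an assumption.
from torch.utils.cpp_extension import load

geom_cuda = load(
    name="geom_cuda",
    sources=[
        "src/curobo/curobolib/cpp/geom_cuda.cpp",
        "src/curobo/curobolib/cpp/sphere_obb_kernel.cu",  # assumed kernel source
    ],
    verbose=True,
)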
260
src/curobo/curobolib/cpp/geom_cuda.cpp
Normal file
@@ -0,0 +1,260 @@
/*
 * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */
#include <map>
#include <torch/extension.h>

#include <c10/cuda/CUDAGuard.h>
#include <vector>

// CUDA forward declarations

std::vector<torch::Tensor> self_collision_distance(
    torch::Tensor out_distance, torch::Tensor out_vec,
    torch::Tensor sparse_index,
    const torch::Tensor robot_spheres,    // batch_size x n_spheres x 4
    const torch::Tensor collision_offset, // n_spheres x n_spheres
    const torch::Tensor weight,
    const torch::Tensor collision_matrix, // n_spheres x n_spheres
    const torch::Tensor thread_locations, const int locations_size,
    const int batch_size, const int nspheres, const bool compute_grad = false,
    const int ndpt = 8, // Does this need to match template?
    const bool debug = false);

// CUDA forward declarations

std::vector<torch::Tensor> swept_sphere_obb_clpt(
    const torch::Tensor sphere_position, // batch_size, 3
    torch::Tensor distance,              // batch_size, 1
    torch::Tensor
        closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
    torch::Tensor sparsity_idx, const torch::Tensor weight,
    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
    const torch::Tensor obb_accel,     // n_boxes, 4, 4
    const torch::Tensor obb_bounds,    // n_boxes, 3
    const torch::Tensor obb_pose,      // n_boxes, 4, 4
    const torch::Tensor obb_enable,    // n_boxes, 4,
    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
    const torch::Tensor env_query_idx, // n_boxes, 4, 4
    const int max_nobs, const int batch_size, const int horizon,
    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
    const bool transform_back, const bool compute_distance,
    const bool use_batch_env);

std::vector<torch::Tensor>
sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
                torch::Tensor distance,
                torch::Tensor closest_point, // batch size, 3
                torch::Tensor sparsity_idx, const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor obb_accel,     // n_boxes, 4, 4
                const torch::Tensor obb_bounds,    // n_boxes, 3
                const torch::Tensor obb_pose,      // n_boxes, 4, 4
                const torch::Tensor obb_enable,    // n_boxes, 4, 4
                const torch::Tensor n_env_obb,     // n_boxes, 4, 4
                const torch::Tensor env_query_idx, // n_boxes, 4, 4
                const int max_nobs, const int batch_size, const int horizon,
                const int n_spheres, const bool transform_back,
                const bool compute_distance, const bool use_batch_env);

std::vector<torch::Tensor> pose_distance(
    torch::Tensor out_distance, torch::Tensor out_position_distance,
    torch::Tensor out_rotation_distance,
    torch::Tensor distance_p_vector, // batch size, 3
    torch::Tensor distance_q_vector, // batch size, 4
    torch::Tensor out_gidx,
    const torch::Tensor current_position, // batch_size, 3
    const torch::Tensor goal_position,    // n_boxes, 3
    const torch::Tensor current_quat, const torch::Tensor goal_quat,
    const torch::Tensor vec_weight, // n_boxes, 4, 4
    const torch::Tensor weight,     // n_boxes, 4, 4
    const torch::Tensor vec_convergence, const torch::Tensor run_weight,
    const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
    const int batch_size, const int horizon, const int mode,
    const int num_goals = 1, const bool compute_grad = false,
    const bool write_distance = true, const bool use_metric = false);

std::vector<torch::Tensor>
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
                       const torch::Tensor grad_distance,   // batch_size, 3
                       const torch::Tensor grad_p_distance, // n_boxes, 3
                       const torch::Tensor grad_q_distance,
                       const torch::Tensor pose_weight,
                       const torch::Tensor grad_p_vec, // n_boxes, 4, 4
                       const torch::Tensor grad_q_vec, const int batch_size,
                       const bool use_distance = false);

// C++ interface

// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> self_collision_distance_wrapper(
    torch::Tensor out_distance, torch::Tensor out_vec,
    torch::Tensor sparse_index,
    const torch::Tensor robot_spheres,    // batch_size x n_spheres x 4
    const torch::Tensor collision_offset, // n_spheres
    const torch::Tensor weight,
    const torch::Tensor collision_matrix, // n_spheres
    const torch::Tensor thread_locations, const int thread_locations_size,
    const int batch_size, const int nspheres, const bool compute_grad = false,
    const int ndpt = 8, const bool debug = false) {

  CHECK_INPUT(out_distance);
  CHECK_INPUT(out_vec);
  CHECK_INPUT(robot_spheres);
  CHECK_INPUT(collision_offset);
  CHECK_INPUT(sparse_index);
  CHECK_INPUT(weight);
  CHECK_INPUT(thread_locations);
  CHECK_INPUT(collision_matrix);
  const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());

  return self_collision_distance(
      out_distance, out_vec, sparse_index, robot_spheres,
      collision_offset, weight, collision_matrix, thread_locations,
      thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
}

std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
    const torch::Tensor sphere_position, // batch_size, 4
    torch::Tensor distance,
    torch::Tensor closest_point, // batch size, 3
    torch::Tensor sparsity_idx, const torch::Tensor weight,
    const torch::Tensor activation_distance,
    const torch::Tensor obb_accel,     // n_boxes, 4, 4
    const torch::Tensor obb_bounds,    // n_boxes, 3
    const torch::Tensor obb_pose,      // n_boxes, 4, 4
    const torch::Tensor obb_enable,    // n_boxes, 4, 4
    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
    const torch::Tensor env_query_idx, // n_boxes, 4, 4
    const int max_nobs, const int batch_size, const int horizon,
    const int n_spheres, const bool transform_back, const bool compute_distance,
    const bool use_batch_env) {

  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
  CHECK_INPUT(distance);
  CHECK_INPUT(closest_point);
  CHECK_INPUT(sphere_position);
  CHECK_INPUT(sparsity_idx);
  CHECK_INPUT(weight);
  CHECK_INPUT(activation_distance);
  CHECK_INPUT(obb_accel);
  return sphere_obb_clpt(
      sphere_position, distance, closest_point, sparsity_idx, weight,
      activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
      n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
      transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> swept_sphere_obb_clpt_wrapper(
    const torch::Tensor sphere_position, // batch_size, 4
    torch::Tensor distance,
    torch::Tensor closest_point, // batch size, 3
    torch::Tensor sparsity_idx, const torch::Tensor weight,
    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
    const torch::Tensor obb_accel,     // n_boxes, 4, 4
    const torch::Tensor obb_bounds,    // n_boxes, 3
    const torch::Tensor obb_pose,      // n_boxes, 4, 4
    const torch::Tensor obb_enable,    // n_boxes, 4, 4
    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
    const torch::Tensor env_query_idx, // n_boxes, 4, 4
    const int max_nobs, const int batch_size, const int horizon,
    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
    const bool transform_back, const bool compute_distance,
    const bool use_batch_env) {
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
  CHECK_INPUT(distance);
  CHECK_INPUT(closest_point);
  CHECK_INPUT(sphere_position);

  return swept_sphere_obb_clpt(
      sphere_position,
      distance, closest_point, sparsity_idx, weight, activation_distance,
      speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
      env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
      enable_speed_metric, transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> pose_distance_wrapper(
    torch::Tensor out_distance, torch::Tensor out_position_distance,
    torch::Tensor out_rotation_distance,
    torch::Tensor distance_p_vector, // batch size, 3
    torch::Tensor distance_q_vector, // batch size, 4
    torch::Tensor out_gidx,
    const torch::Tensor current_position, // batch_size, 3
    const torch::Tensor goal_position,    // n_boxes, 3
    const torch::Tensor current_quat, const torch::Tensor goal_quat,
    const torch::Tensor vec_weight, // n_boxes, 4, 4
    const torch::Tensor weight, const torch::Tensor vec_convergence,
    const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
    const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
    const int mode, const int num_goals = 1, const bool compute_grad = false,
    const bool write_distance = false, const bool use_metric = false) {

  // at::cuda::DeviceGuard guard(angle.device());
  CHECK_INPUT(out_distance);
  CHECK_INPUT(out_position_distance);
  CHECK_INPUT(out_rotation_distance);
  CHECK_INPUT(distance_p_vector);
  CHECK_INPUT(distance_q_vector);
  CHECK_INPUT(current_position);
  CHECK_INPUT(goal_position);
  CHECK_INPUT(current_quat);
  CHECK_INPUT(goal_quat);
  CHECK_INPUT(batch_pose_idx);
  const at::cuda::OptionalCUDAGuard guard(current_position.device());

  return pose_distance(
      out_distance, out_position_distance, out_rotation_distance,
      distance_p_vector, distance_q_vector, out_gidx, current_position,
      goal_position, current_quat, goal_quat, vec_weight, weight,
      vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
      horizon, mode, num_goals, compute_grad, write_distance, use_metric);
}

std::vector<torch::Tensor> backward_pose_distance_wrapper(
    torch::Tensor out_grad_p, torch::Tensor out_grad_q,
    const torch::Tensor grad_distance,   // batch_size, 3
    const torch::Tensor grad_p_distance, // n_boxes, 3
    const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
    const torch::Tensor grad_p_vec, // n_boxes, 4, 4
    const torch::Tensor grad_q_vec, const int batch_size,
    const bool use_distance) {
  CHECK_INPUT(out_grad_p);
  CHECK_INPUT(out_grad_q);
  CHECK_INPUT(grad_distance);
  CHECK_INPUT(grad_p_distance);
  CHECK_INPUT(grad_q_distance);

  const at::cuda::OptionalCUDAGuard guard(grad_distance.device());

  return backward_pose_distance(
      out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
      pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
  m.def("pose_distance_backward", &backward_pose_distance_wrapper,
        "Pose Distance Backward (curobolib)");

  m.def("closest_point", &sphere_obb_clpt_wrapper,
        "Closest Point OBB(curobolib)");
  m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
        "Swept Closest Point OBB(curobolib)");

  m.def("self_collision_distance", &self_collision_distance_wrapper,
        "Self Collision Distance (curobolib)");
}
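Continuing the loader sketch above, a hedged example of how the self_collision_distance binding might be called from Python once the module is compiled. The argument order follows the wrapper signature; the concrete sizes, dtypes, and output shapes below are illustrative assumptions (the wrapper only enforces CUDA, contiguous tensors), not values taken from this commit.

# Hypothetical call into the compiled geom_cuda module; shapes/dtypes are assumed.
import torch

batch_size, n_spheres = 32, 64
dev = torch.device("cuda")

out_distance = torch.zeros(batch_size, device=dev)                 # assumed output shape
out_vec = torch.zeros(batch_size, n_spheres, 4, device=dev)
sparse_index = torch.zeros(batch_size, n_spheres, dtype=torch.uint8, device=dev)
robot_spheres = torch.rand(batch_size, n_spheres, 4, device=dev)   # x, y, z, radius
collision_offset = torch.zeros(n_spheres, n_spheres, device=dev)
weight = torch.ones(1, device=dev)
collision_matrix = torch.zeros(n_spheres, n_spheres, device=dev)
thread_locations = torch.zeros(2 * n_spheres, dtype=torch.int16, device=dev)

result = geom_cuda.self_collision_distance(
    out_distance, out_vec, sparse_index, robot_spheres, collision_offset,
    weight, collision_matrix, thread_locations, thread_locations.shape[0],
    batch_size, n_spheres, True, 8, False)  # compute_grad, ndpt, debug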
1453
src/curobo/curobolib/cpp/helper_math.h
Normal file
File diff suppressed because it is too large
106
src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
Normal file
@@ -0,0 +1,106 @@
/*
 * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */
#include <torch/extension.h>

#include <c10/cuda/CUDAGuard.h>
#include <vector>

// CUDA forward declarations

std::vector<torch::Tensor>
matrix_to_quaternion(torch::Tensor out_quat,
                     const torch::Tensor in_rot // batch_size, 3
);

std::vector<torch::Tensor> kin_fused_forward(
    torch::Tensor link_pos, torch::Tensor link_quat,
    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
    const torch::Tensor robot_spheres, const torch::Tensor link_map,
    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
    const int batch_size, const int n_spheres,
    const bool use_global_cumul = false);

std::vector<torch::Tensor> kin_fused_backward_16t(
    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
    const torch::Tensor link_map, const torch::Tensor joint_map,
    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
    const bool use_global_cumul = false);
// C++ interface

// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> kin_forward_wrapper(
    torch::Tensor link_pos, torch::Tensor link_quat,
    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
    const torch::Tensor robot_spheres, const torch::Tensor link_map,
    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
    const int batch_size, const int n_spheres,
    const bool use_global_cumul = false) {

  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

  // TODO: add check input
  return kin_fused_forward(
      link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
      fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
      store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
}

std::vector<torch::Tensor> kin_backward_wrapper(
    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
    const torch::Tensor link_map, const torch::Tensor joint_map,
    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
    const bool use_global_cumul = false) {
  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

  return kin_fused_backward_16t(
      grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
      global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
      joint_map, joint_map_type, store_link_map, link_sphere_map,
      link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
}

std::vector<torch::Tensor>
matrix_to_quaternion_wrapper(torch::Tensor out_quat,
                             const torch::Tensor in_rot // batch_size, 3
) {
  const at::cuda::OptionalCUDAGuard guard(in_rot.device());
  CHECK_INPUT(in_rot);
  CHECK_INPUT(out_quat);
  return matrix_to_quaternion(out_quat, in_rot);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
  m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
  m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
        "Rotation Matrix to Quaternion (CUDA)");
}
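A hedged sketch of invoking the matrix_to_quaternion binding registered above, assuming the compiled extension is importable as kinematics_fused_cuda. The wrapper writes into a preallocated out_quat and returns it in a list; the batch-of-3x3-rotations layout and the batch x 4 output shape are assumptions for illustration (the declaration's comment only says batch_size, 3).

# Hypothetical usage of the matrix_to_quaternion binding; layout is assumed.
import torch

b = 16
in_rot = torch.eye(3, device="cuda").expand(b, 3, 3).contiguous()  # b x 3 x 3 rotations
out_quat = torch.zeros(b, 4, device="cuda")                        # preallocated output

(out_quat,) = kinematics_fused_cuda.matrix_to_quaternion(out_quat, in_rot)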
1255
src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
Normal file
File diff suppressed because it is too large
121
src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
Normal file
@@ -0,0 +1,121 @@
/*
 * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */
#include <torch/extension.h>

#include <vector>

#include <c10/cuda/CUDAGuard.h>

// CUDA forward declarations
std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
                                       torch::Tensor rho_buffer,
                                       torch::Tensor sum, const int batch_size,
                                       const int v_dim, const int m);

std::vector<torch::Tensor>
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
                torch::Tensor y_buffer, torch::Tensor s_buffer,
                torch::Tensor grad_q, const float epsilon, const int batch_size,
                const int m, const int v_dim);

std::vector<torch::Tensor>
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
                  const int m, const int v_dim);

std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
                torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
                torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
                const float epsilon, const int batch_size, const int m,
                const int v_dim, const bool stable_mode);
// C++ interface

// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor>
lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
                torch::Tensor y_buffer, torch::Tensor s_buffer,
                torch::Tensor grad_q, const float epsilon, const int batch_size,
                const int m, const int v_dim) {

  CHECK_INPUT(step_vec);
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
  CHECK_INPUT(s_buffer);
  CHECK_INPUT(grad_q);
  const at::cuda::OptionalCUDAGuard guard(grad_q.device());

  return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q,
                         epsilon, batch_size, m, v_dim);
}

std::vector<torch::Tensor>
lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
                  const int m, const int v_dim) {
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
  CHECK_INPUT(s_buffer);
  CHECK_INPUT(grad_q);
  CHECK_INPUT(x_0);
  CHECK_INPUT(grad_0);
  CHECK_INPUT(q);
  const at::cuda::OptionalCUDAGuard guard(grad_q.device());

  return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0,
                           grad_0, batch_size, m, v_dim);
}

std::vector<torch::Tensor>
reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
                 torch::Tensor rho_buffer, torch::Tensor sum,
                 const int batch_size, const int v_dim, const int m) {
  const at::cuda::OptionalCUDAGuard guard(sum.device());

  return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, v_dim, m);
}

std::vector<torch::Tensor>
lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
           torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
           torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
           const float epsilon, const int batch_size, const int m,
           const int v_dim, const bool stable_mode) {
  CHECK_INPUT(step_vec);
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
  CHECK_INPUT(s_buffer);
  CHECK_INPUT(grad_q);
  CHECK_INPUT(x_0);
  CHECK_INPUT(grad_0);
  CHECK_INPUT(q);
  const at::cuda::OptionalCUDAGuard guard(grad_q.device());

  return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q,
                         x_0, grad_0, epsilon, batch_size, m, v_dim,
                         stable_mode);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)");
  m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)");
  m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)");
  m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug");
}
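A hedged sketch of driving the fused update + step binding registered above as "forward", assuming the extension is importable as lbfgs_step_cuda. The m x batch x v_dim buffer layout follows the kernel comments in the next file; the concrete history length, batch size, problem dimension, and epsilon are illustrative assumptions.

# Hypothetical call into the fused L-BFGS update + step ("forward") binding.
import torch

batch, m, v_dim = 4, 15, 175
dev = "cuda"

step_vec = torch.zeros(batch, v_dim, device=dev)
rho_buffer = torch.zeros(m, batch, 1, device=dev)
y_buffer = torch.zeros(m, batch, v_dim, device=dev)
s_buffer = torch.zeros(m, batch, v_dim, device=dev)
q = torch.randn(batch, v_dim, device=dev)
grad_q = torch.randn(batch, v_dim, device=dev)
x_0 = torch.zeros(batch, v_dim, device=dev)
grad_0 = torch.zeros(batch, v_dim, device=dev)

outputs = lbfgs_step_cuda.forward(
    step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0,
    0.01, batch, m, v_dim, True)  # epsilon, batch_size, m, v_dim, stable_mode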
869
src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
Normal file
@@ -0,0 +1,869 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <cuda.h>
|
||||
#include <torch/extension.h>
|
||||
#include <vector>
|
||||
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
// #include "helper_cuda.h"
|
||||
#include "helper_math.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <math.h>
|
||||
|
||||
// #include <stdio.h>
|
||||
//
|
||||
// For the CUDA runtime routines (prefixed with "cuda_")
|
||||
// #include <cuda_runtime.h>
|
||||
|
||||
// #include <cuda_fp16.h>
|
||||
// #include <helper_cuda.h>
|
||||
#define M_MAX 512
|
||||
#define HALF_MAX 65504.0
|
||||
#define M 15
|
||||
#define VDIM 175 // 25 * 7,
|
||||
|
||||
#define FULL_MASK 0xffffffff
|
||||
|
||||
namespace Curobo {
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ __forceinline__ void
|
||||
scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out,
|
||||
const int v_dim) {
|
||||
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
out[i] = a * b[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ __forceinline__ void
|
||||
m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out,
|
||||
const int v_dim, const int m) {
|
||||
for (int j = 0; j < m; j++) {
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
out[j * v_dim + i] = a[j] * b[j * v_dim + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ __forceinline__ void vec_vec_dot(const scalar_t *a,
|
||||
const scalar_t *b, scalar_t &out,
|
||||
const int v_dim) {
|
||||
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
out += a[i] * b[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ __forceinline__ void update_r(const scalar_t *rho_y,
|
||||
const scalar_t *s_buffer, scalar_t *r,
|
||||
scalar_t &alpha, const int v_dim) {
|
||||
// dot product: and subtract with alpha
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
alpha -= rho_y[i] * r[i];
|
||||
}
|
||||
// scalar vector product:
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
r[i] = r[i] + alpha * s_buffer[i];
|
||||
}
|
||||
}
|
||||
template <typename scalar_t>
|
||||
__device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq,
|
||||
const scalar_t alpha,
|
||||
const int v_dim) {
|
||||
//
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
gq[i] = gq[i] - (alpha * y_buffer[i]);
|
||||
}
|
||||
}
|
||||
template <typename scalar_t>
|
||||
__global__ void
|
||||
lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer,
|
||||
const scalar_t *y_buffer, const scalar_t *s_buffer,
|
||||
const scalar_t *grad_q, const float epsilon,
|
||||
const int batchSize, const int m, const int v_dim) {
|
||||
// each thread writes one sphere of some link
|
||||
const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch
|
||||
const int b_idx = t;
|
||||
if (t >= (batchSize)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// get thread start address:
|
||||
const int b_start_scalar_adrs = b_idx * m;
|
||||
const int b_start_vec_adrs = b_idx * m * v_dim;
|
||||
const int b_step_start_adrs = b_idx * v_dim;
|
||||
|
||||
scalar_t rho_s[M * VDIM];
|
||||
|
||||
// copy floats to local buffer?
|
||||
// y_buffer, s_buffer, rho_buffer
|
||||
// compute rho_s
|
||||
scalar_t loc_ybuf[M * VDIM];
|
||||
scalar_t loc_sbuf[M * VDIM];
|
||||
scalar_t loc_rho[M];
|
||||
scalar_t gq[VDIM]; //, l_q[VDIM];
|
||||
scalar_t alpha_buffer[M];
|
||||
scalar_t t_1, t_2;
|
||||
|
||||
for (int i = 0; i < m * v_dim; i++) {
|
||||
loc_ybuf[i] = y_buffer[b_start_vec_adrs + i];
|
||||
loc_sbuf[i] = s_buffer[b_start_vec_adrs + i];
|
||||
}
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
gq[i] = grad_q[b_step_start_adrs + i];
|
||||
}
|
||||
for (int i = 0; i < m; i++) {
|
||||
loc_rho[i] = rho_buffer[b_start_scalar_adrs + i];
|
||||
}
|
||||
|
||||
m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m);
|
||||
// for loop over m
|
||||
for (int i = m - 1; i > m - 2; i--) {
|
||||
// l_start_vec_adrs = i * v_dim;
|
||||
// scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim],
|
||||
// v_dim);
|
||||
vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim);
|
||||
update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim);
|
||||
}
|
||||
// compute initial hessian:
|
||||
vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1,
|
||||
v_dim);
|
||||
vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2,
|
||||
v_dim);
|
||||
t_1 = t_1 / t_2;
|
||||
if (t_1 < 0) {
|
||||
t_1 = 0;
|
||||
}
|
||||
|
||||
t_1 += epsilon;
|
||||
scalar_vec_product(t_1, &gq[0], &gq[0], v_dim);
|
||||
|
||||
m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m);
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
// scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim],
|
||||
// v_dim);
|
||||
update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i],
|
||||
v_dim);
|
||||
}
|
||||
|
||||
// write gq to out grad:
|
||||
|
||||
for (int i = 0; i < v_dim; i++) {
|
||||
step_vec[b_step_start_adrs + i] = -1.0 * gq[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename psum_t>
|
||||
__forceinline__ __device__ psum_t warpReduce(psum_t v, int elems,
|
||||
unsigned mask) {
|
||||
psum_t val = v;
|
||||
int shift = 1;
|
||||
for (int i = elems; i > 1; i /= 2) {
|
||||
val += __shfl_down_sync(mask, val, shift);
|
||||
shift *= 2;
|
||||
}
|
||||
// val += __shfl_down_sync(mask, val, 1); // i=32
|
||||
// val += __shfl_down_sync(mask, val, 2); // i=16
|
||||
// val += __shfl_down_sync(mask, val, 4); // i=8
|
||||
// val += __shfl_down_sync(mask, val, 8); // i=4
|
||||
// val += __shfl_down_sync(mask, val, 16); // i=2
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data,
|
||||
scalar_t *result) {
|
||||
psum_t val = v;
|
||||
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
|
||||
val += __shfl_down_sync(mask, val, 1);
|
||||
val += __shfl_down_sync(mask, val, 2);
|
||||
val += __shfl_down_sync(mask, val, 4);
|
||||
val += __shfl_down_sync(mask, val, 8);
|
||||
val += __shfl_down_sync(mask, val, 16);
|
||||
// int leader = __ffs(mask) - 1; // select a leader lane
|
||||
int leader = 0;
|
||||
if (threadIdx.x % 32 == leader) {
|
||||
if (m < 32) {
|
||||
result[0] = (scalar_t)val;
|
||||
} else {
|
||||
data[(threadIdx.x + 1) / 32] = val;
|
||||
}
|
||||
}
|
||||
if (m >= 32) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int elems = (m + 31) / 32;
|
||||
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
|
||||
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
|
||||
psum_t val2 = data[threadIdx.x % 32];
|
||||
int shift = 1;
|
||||
for (int i = elems - 1; i > 0; i /= 2) {
|
||||
val2 += __shfl_down_sync(mask2, val2, shift);
|
||||
shift *= 2;
|
||||
}
|
||||
// int leader = __ffs(mask2) - 1; // select a leader lane
|
||||
int leader = 0;
|
||||
if (threadIdx.x % 32 == leader) {
|
||||
result[0] = (scalar_t)val2;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// blockReduce
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data,
|
||||
scalar_t *result) {
|
||||
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
|
||||
psum_t val = warpReduce(v, 32, mask);
|
||||
|
||||
// int leader = __ffs(mask) - 1; // select a leader lane
|
||||
int leader = 0;
|
||||
if (threadIdx.x % 32 == leader) {
|
||||
data[(threadIdx.x + 1) / 32] = val;
|
||||
}
|
||||
|
||||
if (m >= 32) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int elems = (m + 31) / 32;
|
||||
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
|
||||
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
|
||||
psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2);
|
||||
|
||||
// // int leader = __ffs(mask2) - 1; // select a leader lane
|
||||
if (threadIdx.x == leader) {
|
||||
result[0] = (scalar_t)val2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (threadIdx.x == leader) {
|
||||
result[0] = (scalar_t)val;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2,
|
||||
const int m, psum_t *data, scalar_t *result) {
|
||||
|
||||
scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x];
|
||||
reduce(val, m, data, result);
|
||||
}
|
||||
|
||||
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
|
||||
if (var < 0)
|
||||
return 0;
|
||||
else
|
||||
return var;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// one block per batch
|
||||
// v_dim threads per block
|
||||
//////////////////////////////////////////////////////////
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void lbfgs_step_kernel(scalar_t *step_vec, // b x 175
|
||||
scalar_t *rho_buffer, // m x b x 1
|
||||
const scalar_t *y_buffer, // m x b x 175
|
||||
const scalar_t *s_buffer, // m x b x 175
|
||||
const scalar_t *grad_q, // b x 175
|
||||
const float epsilon, const int batchsize,
|
||||
const int m, const int v_dim) {
|
||||
__shared__ psum_t
|
||||
data[32]; // temporary buffer needed for block-wide reduction
|
||||
__shared__ scalar_t
|
||||
result; // result of the reduction or vector-vector dot product
|
||||
__shared__ scalar_t gq[175]; /// gq = batch * v_dim
|
||||
assert(v_dim < 176);
|
||||
int batch = blockIdx.x; // one block per batch
|
||||
if (threadIdx.x >= v_dim)
|
||||
return;
|
||||
|
||||
gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
|
||||
|
||||
scalar_t alpha_buffer[16];
|
||||
// assert(m<16); // allocating a buffer assuming m < 16
|
||||
|
||||
for (int i = m - 1; i > -1; i--) {
|
||||
dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
|
||||
&data[0], &result);
|
||||
alpha_buffer[i] = result * rho_buffer[i * batchsize + batch];
|
||||
gq[threadIdx.x] =
|
||||
gq[threadIdx.x] -
|
||||
alpha_buffer[i] *
|
||||
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
|
||||
// compute var1
|
||||
scalar_t val1 =
|
||||
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
scalar_t val2 =
|
||||
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
reduce(val1 * val1, v_dim, data, &result);
|
||||
scalar_t denominator = result;
|
||||
reduce(val1 * val2, v_dim, data, &result);
|
||||
scalar_t numerator = result;
|
||||
scalar_t var1 = numerator / denominator;
|
||||
|
||||
scalar_t gamma = relu(var1) + epsilon; // epsilon
|
||||
|
||||
gq[threadIdx.x] = gamma * gq[threadIdx.x];
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
|
||||
&data[0], &result);
|
||||
gq[threadIdx.x] =
|
||||
gq[threadIdx.x] +
|
||||
(alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) *
|
||||
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
|
||||
step_vec[batch * v_dim + threadIdx.x] =
|
||||
-1 * gq[threadIdx.x]; // copy from shared memory to global memory
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1
|
||||
scalar_t *y_buffer, // m x b x 175
|
||||
scalar_t *s_buffer, // m x b x 175
|
||||
scalar_t *q, // b x 175
|
||||
scalar_t *x_0, // b x 175
|
||||
scalar_t *grad_0, // b x 175
|
||||
const scalar_t *grad_q, // b x 175
|
||||
const int batchsize, const int m,
|
||||
const int v_dim) {
|
||||
__shared__ psum_t
|
||||
data[32]; // temporary buffer needed for block-wide reduction
|
||||
__shared__ scalar_t
|
||||
result; // result of the reduction or vector-vector dot product
|
||||
// __shared__ scalar_t y[175]; // temporary shared memory storage
|
||||
// __shared__ scalar_t s[175]; // temporary shared memory storage
|
||||
assert(v_dim <= VDIM);
|
||||
int batch = blockIdx.x; // one block per batch
|
||||
if (threadIdx.x >= v_dim)
|
||||
return;
|
||||
|
||||
scalar_t y =
|
||||
grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x];
|
||||
scalar_t s =
|
||||
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
|
||||
reduce(y * s, v_dim, &data[0], &result);
|
||||
|
||||
for (int i = 1; i < m; i++) {
|
||||
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
|
||||
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
|
||||
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
|
||||
// __syncthreads();
|
||||
|
||||
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
|
||||
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
|
||||
grad_0[batch * v_dim + threadIdx.x] = grad_q[batch * v_dim + threadIdx.x];
|
||||
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scalar_t rho = 1 / result;
|
||||
for (int i = 1; i < m; i++) {
|
||||
rho_buffer[(i - 1) * batchsize + batch] =
|
||||
rho_buffer[i * batchsize + batch];
|
||||
}
|
||||
rho_buffer[(m - 1) * batchsize + batch] = rho;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void reduce_kernel(
|
||||
scalar_t *vec1, // b x 175
|
||||
scalar_t *vec2, // b x 175
|
||||
scalar_t *rho_buffer, // m x b x 1
|
||||
|
||||
scalar_t *sum_out, // m x b x 1
|
||||
|
||||
const int batchsize, const int m,
|
||||
const int v_dim) // s_buffer and y_buffer are not rolled by default
|
||||
{
|
||||
__shared__ psum_t
|
||||
data[32]; // temporary buffer needed for block-wide reduction
|
||||
__shared__ scalar_t
|
||||
result; // result of the reduction or vector-vector dot product
|
||||
int batch = blockIdx.x; // one block per batch
|
||||
if (threadIdx.x >= v_dim)
|
||||
return;
|
||||
|
||||
////////////////////
|
||||
// update_buffer
|
||||
////////////////////
|
||||
scalar_t y = vec1[batch * v_dim + threadIdx.x];
|
||||
scalar_t s = vec2[batch * v_dim + threadIdx.x];
|
||||
|
||||
reduce(y * s, v_dim, &data[0], &result);
|
||||
scalar_t numerator = result;
|
||||
if (threadIdx.x == 0) {
|
||||
sum_out[batch] = 1 / numerator;
|
||||
}
|
||||
// return;
|
||||
if (threadIdx.x < m - 1) {
|
||||
// m threads participate to shift the values
|
||||
// this is safe as m<32 and this happens in lockstep
|
||||
rho_buffer[threadIdx.x * batchsize + batch] =
|
||||
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
|
||||
} else if (threadIdx.x == m - 1) {
|
||||
scalar_t rho = (1 / numerator);
|
||||
rho_buffer[threadIdx.x * batchsize + batch] = rho;
|
||||
}
|
||||
}
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void lbfgs_update_buffer_and_step_v1(
|
||||
scalar_t *step_vec, // b x 175
|
||||
scalar_t *rho_buffer, // m x b x 1
|
||||
scalar_t *y_buffer, // m x b x 175
|
||||
scalar_t *s_buffer, // m x b x 175
|
||||
scalar_t *q, // b x 175
|
||||
scalar_t *x_0, // b x 175
|
||||
scalar_t *grad_0, // b x 175
|
||||
const scalar_t *grad_q, // b x 175
|
||||
const float epsilon, const int batchsize, const int m, const int v_dim,
|
||||
const bool rolled_ys = false,
|
||||
const bool stable_mode =
|
||||
false) // s_buffer and y_buffer are not rolled by default
|
||||
{
|
||||
// extern __shared__ scalar_t alpha_buffer_sh[];
|
||||
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
|
||||
scalar_t *my_smem_rc = reinterpret_cast<scalar_t *>(my_smem);
|
||||
scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x
|
||||
scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m
|
||||
scalar_t *s_buffer_sh =
|
||||
&my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x
|
||||
scalar_t *y_buffer_sh =
|
||||
&my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x
|
||||
|
||||
__shared__ psum_t
|
||||
data[32]; // temporary buffer needed for block-wide reduction
|
||||
__shared__ scalar_t
|
||||
result; // result of the reduction or vector-vector dot product
|
||||
int batch = blockIdx.x; // one block per batch
|
||||
if (threadIdx.x >= v_dim)
|
||||
return;
|
||||
|
||||
scalar_t gq;
|
||||
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
|
||||
|
||||
////////////////////
|
||||
// update_buffer
|
||||
////////////////////
|
||||
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
|
||||
// if y is close to zero
|
||||
scalar_t s =
|
||||
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
|
||||
reduce_v1(y * s, v_dim, &data[0], &result);
|
||||
scalar_t numerator = result;
|
||||
// scalar_t rho = 1.0/numerator;
|
||||
|
||||
if (!rolled_ys) {
|
||||
#pragma unroll
|
||||
for (int i = 1; i < m; i++) {
|
||||
scalar_t st =
|
||||
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
scalar_t yt =
|
||||
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st;
|
||||
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt;
|
||||
s_buffer_sh[m * threadIdx.x + i - 1] = st;
|
||||
y_buffer_sh[m * threadIdx.x + i - 1] = yt;
|
||||
}
|
||||
}
|
||||
|
||||
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
|
||||
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
|
||||
s_buffer_sh[m * threadIdx.x + m - 1] = s;
|
||||
y_buffer_sh[m * threadIdx.x + m - 1] = y;
|
||||
grad_0[batch * v_dim + threadIdx.x] = gq;
|
||||
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
|
||||
if (threadIdx.x < m - 1) {
|
||||
// m threads participate to shift the values
|
||||
// this is safe as m<32 and this happens in lockstep
|
||||
scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch];
|
||||
rho_buffer[threadIdx.x * batchsize + batch] = rho;
|
||||
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
|
||||
}
|
||||
if (threadIdx.x == m - 1) {
|
||||
scalar_t rho = 1.0 / numerator;
|
||||
// if this is nan, make it zero:
|
||||
if (stable_mode && numerator == 0.0) {
|
||||
rho = 0.0;
|
||||
}
|
||||
rho_buffer[threadIdx.x * batchsize + batch] = rho;
|
||||
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
|
||||
}
|
||||
// return;
|
||||
__syncthreads();
|
||||
////////////////////
|
||||
// step
|
||||
////////////////////
|
||||
// scalar_t alpha_buffer[16];
|
||||
// assert(m<16); // allocating a buffer assuming m < 16
|
||||
|
||||
#pragma unroll
|
||||
for (int i = m - 1; i > -1; i--) {
|
||||
// reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
|
||||
// v_dim, &data[0], &result);
|
||||
reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
|
||||
alpha_buffer_sh[threadIdx.x * m + i] =
|
||||
result * rho_buffer_sh[i * batchsize + batch];
|
||||
// gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim +
|
||||
// batch*v_dim + threadIdx.x];
|
||||
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
|
||||
y_buffer_sh[m * threadIdx.x + i];
|
||||
}
|
||||
|
||||
// compute var1
|
||||
reduce_v1(y * y, v_dim, data, &result);
|
||||
scalar_t denominator = result;
|
||||
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
|
||||
// scalar_t numerator = result;
|
||||
scalar_t var1 = numerator / denominator;
|
||||
|
||||
// To improve stability, uncomment below line: [this however leads to poor
|
||||
// convergence]
|
||||
|
||||
if (stable_mode && denominator == 0.0) {
|
||||
var1 = epsilon;
|
||||
}
|
||||
|
||||
scalar_t gamma = relu(var1);
|
||||
gq = gamma * gq;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < m; i++) {
|
||||
// reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
|
||||
// v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] -
|
||||
// result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim +
|
||||
// batch*v_dim + threadIdx.x];
|
||||
reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
|
||||
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
|
||||
result * rho_buffer_sh[i * batchsize + batch]) *
|
||||
s_buffer_sh[m * threadIdx.x + i];
|
||||
}
|
||||
|
||||
step_vec[batch * v_dim + threadIdx.x] =
|
||||
-1.0 * gq; // copy from shared memory to global memory
|
||||
}
|
||||
|
||||
// (32/M) rolls per warp
|
||||
// Threads in a warp in a GPU execute in a lock-step. We leverage that to do
|
||||
// the roll without using temporary storage or explicit synchronization.
|
||||
template <typename scalar_t>
|
||||
__global__ void lbfgs_roll(scalar_t *a, // m x b x 175
|
||||
scalar_t *b, // m x b x 175
|
||||
const int m_t, const int batchsize, const int m,
|
||||
const int v_dim) {
|
||||
assert(m_t <= 32);
|
||||
int t = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (t >= m_t * v_dim * batchsize)
|
||||
return;
|
||||
int _m = t % m_t;
|
||||
int _v_dim = (t / m_t) % v_dim;
|
||||
int batch = t / (m * v_dim); // this line could be wrong?
|
||||
|
||||
if (_m < m - 1) {
|
||||
a[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
|
||||
a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
|
||||
b[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
|
||||
b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename psum_t, bool rolled_ys>
|
||||
__global__ void lbfgs_update_buffer_and_step(
|
||||
scalar_t *step_vec, // b x 175
|
||||
scalar_t *rho_buffer, // m x b x 1
|
||||
scalar_t *y_buffer, // m x b x 175
|
||||
scalar_t *s_buffer, // m x b x 175
|
||||
scalar_t *q, // b x 175
|
||||
scalar_t *x_0, // b x 175
|
||||
scalar_t *grad_0, // b x 175
|
||||
const scalar_t *grad_q, // b x 175
|
||||
const float epsilon, const int batchsize, const int m, const int v_dim,
|
||||
const bool stable_mode =
|
||||
false) // s_buffer and y_buffer are not rolled by default
|
||||
{
|
||||
// extern __shared__ scalar_t alpha_buffer_sh[];
|
||||
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
|
||||
scalar_t *alpha_buffer_sh = reinterpret_cast<scalar_t *>(my_smem);
|
||||
|
||||
__shared__ psum_t
|
||||
data[32]; // temporary buffer needed for block-wide reduction
|
||||
__shared__ scalar_t
|
||||
result; // result of the reduction or vector-vector dot product
|
||||
int batch = blockIdx.x; // one block per batch
|
||||
if (threadIdx.x >= v_dim)
|
||||
return;
|
||||
|
||||
scalar_t gq;
|
||||
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
|
||||
|
||||
////////////////////
|
||||
// update_buffer
|
||||
////////////////////
|
||||
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
|
||||
// if y is close to zero
|
||||
scalar_t s =
|
||||
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
|
||||
reduce(y * s, v_dim, &data[0], &result);
|
||||
scalar_t numerator = result;
|
||||
// scalar_t rho = 1.0/numerator;
|
||||
|
||||
if (!rolled_ys) {
|
||||
for (int i = 1; i < m; i++) {
|
||||
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
|
||||
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
|
||||
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
|
||||
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
|
||||
grad_0[batch * v_dim + threadIdx.x] = gq;
|
||||
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
|
||||
if (threadIdx.x < m - 1) {
|
||||
// m threads participate to shift the values
|
||||
// this is safe as m<32 and this happens in lockstep
|
||||
rho_buffer[threadIdx.x * batchsize + batch] =
|
||||
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
|
||||
}
|
||||
if (threadIdx.x == m - 1) {
|
||||
scalar_t rho = 1.0 / numerator;
|
||||
// if this is nan, make it zero:
|
||||
if (stable_mode && numerator == 0.0) {
|
||||
rho = 0.0;
|
||||
}
|
||||
rho_buffer[threadIdx.x * batchsize + batch] = rho;
|
||||
}
|
||||
// return;
|
||||
//__syncthreads();
|
||||
////////////////////
|
||||
// step
|
||||
////////////////////
|
||||
// scalar_t alpha_buffer[16];
|
||||
// assert(m<16); // allocating a buffer assuming m < 16
|
||||
|
||||
#pragma unroll
|
||||
for (int i = m - 1; i > -1; i--) {
|
||||
reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
|
||||
v_dim, &data[0], &result);
|
||||
alpha_buffer_sh[threadIdx.x * m + i] =
|
||||
result * rho_buffer[i * batchsize + batch];
|
||||
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
|
||||
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
|
||||
// compute var1
|
||||
reduce(y * y, v_dim, data, &result);
|
||||
scalar_t denominator = result;
|
||||
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
|
||||
// scalar_t numerator = result;
|
||||
scalar_t var1 = numerator / denominator;
|
||||
|
||||
// To improve stability, uncomment below line: [this however leads to poor
|
||||
// convergence]
|
||||
|
||||
if (stable_mode && denominator == 0.0) {
|
||||
var1 = epsilon;
|
||||
}
|
||||
|
||||
scalar_t gamma = relu(var1);
|
||||
|
||||
gq = gamma * gq;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < m; i++) {
|
||||
reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
|
||||
v_dim, &data[0], &result);
|
||||
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
|
||||
result * rho_buffer[i * batchsize + batch]) *
|
||||
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
|
||||
}
|
||||
|
||||
step_vec[batch * v_dim + threadIdx.x] =
|
||||
-1.0 * gq; // copy from shared memory to global memory
|
||||
}
|
||||
|
||||
} // namespace Optimization
|
||||
} // namespace Curobo
|
||||
std::vector<torch::Tensor>
|
||||
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
|
||||
torch::Tensor y_buffer, torch::Tensor s_buffer,
|
||||
torch::Tensor grad_q, const float epsilon, const int batch_size,
|
||||
const int m, const int v_dim) {
|
||||
using namespace Curobo::Optimization;
|
||||
const int threadsPerBlock = 128;
|
||||
const int blocksPerGrid =
|
||||
((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
// launch threads per batch:
|
||||
// int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
|
||||
|
||||
// const int blocksPerGrid = batch_size; //((batch_size) + threadsPerBlock -
|
||||
// 1) / threadsPerBlock;
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
step_vec.scalar_type(), "lbfgs_step_cu", ([&] {
|
||||
lbfgs_step_kernel_old<scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock,
|
||||
v_dim * sizeof(step_vec.scalar_type()), stream>>>(
|
||||
step_vec.data_ptr<scalar_t>(), rho_buffer.data_ptr<scalar_t>(),
|
||||
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
|
||||
grad_q.data_ptr<scalar_t>(), epsilon, batch_size, m, v_dim);
|
||||
}));
|
||||
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {step_vec};
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
|
||||
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
|
||||
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
|
||||
const int m, const int v_dim) {
|
||||
|
||||
using namespace Curobo::Optimization;
|
||||
|
||||
// const int threadsPerBlock = 128;
|
||||
// launch threads per batch:
|
||||
  // int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
  int threadsPerBlock = v_dim;

  const int blocksPerGrid =
      batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
      y_buffer.scalar_type(), "lbfgs_update_cu", ([&] {
        lbfgs_update_buffer_kernel<scalar_t, scalar_t>
            <<<blocksPerGrid, threadsPerBlock,
               // dynamic shared memory: one scalar_t per optimization variable
               v_dim * sizeof(scalar_t), stream>>>(
                rho_buffer.data_ptr<scalar_t>(), y_buffer.data_ptr<scalar_t>(),
                s_buffer.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
                x_0.data_ptr<scalar_t>(), grad_0.data_ptr<scalar_t>(),
                grad_q.data_ptr<scalar_t>(), batch_size, m, v_dim);
      }));
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return {rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}

std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
                torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
                torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
                const float epsilon, const int batch_size, const int m,
                const int v_dim, const bool stable_mode) {
  using namespace Curobo::Optimization;

  // call first kernel:

  int threadsPerBlock = v_dim;
  assert(threadsPerBlock < 1024);
  assert(m < M_MAX);
  int blocksPerGrid =
      batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  int smemsize = 0;
  if (true) {
    AT_DISPATCH_FLOATING_TYPES(
        y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] {
          smemsize = m * threadsPerBlock * sizeof(scalar_t);
          lbfgs_update_buffer_and_step<scalar_t, scalar_t, false>
              <<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
                  step_vec.data_ptr<scalar_t>(),
                  rho_buffer.data_ptr<scalar_t>(),
                  y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
                  q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
                  grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
                  epsilon, batch_size, m, v_dim, stable_mode);
        });
  } else {
    // v1 does not work
    AT_DISPATCH_FLOATING_TYPES(
        y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] {
          smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) +
                     m * batch_size * sizeof(scalar_t);
          lbfgs_update_buffer_and_step_v1<scalar_t, scalar_t>
              <<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
                  step_vec.data_ptr<scalar_t>(),
                  rho_buffer.data_ptr<scalar_t>(),
                  y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
                  q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
                  grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
                  epsilon, batch_size, m, v_dim, false, stable_mode);
        });
  }

  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}

std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
                                       torch::Tensor rho_buffer,
                                       torch::Tensor sum, const int batch_size,
                                       const int m, const int v_dim) {

  using namespace Curobo::Optimization;

  int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1);
  int blocksPerGrid =
      batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
  // printf("threadsPerBlock:%d, blocksPerGrid: %d\n",
  //        threadsPerBlock, blocksPerGrid);

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
      vec.scalar_type(), "reduce_cu", ([&] {
        reduce_kernel<scalar_t, scalar_t>
            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
                vec.data_ptr<scalar_t>(), vec2.data_ptr<scalar_t>(),
                rho_buffer.data_ptr<scalar_t>(), sum.data_ptr<scalar_t>(),
                batch_size, m, v_dim);
      }));
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return {sum, rho_buffer};
}
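// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the library sources): a minimal host-side
// smoke test for the fused L-BFGS entry point defined above. The tensor
// extents below are assumptions chosen only to satisfy the kernel launch
// geometry (batch_size blocks of v_dim threads, m history entries); the shapes
// used by the actual Python wrappers may differ.
// Requires <torch/torch.h> if compiled outside this translation unit.
// ---------------------------------------------------------------------------
void lbfgs_fuse_smoke_test() {
  const int batch_size = 4, m = 15, v_dim = 512;
  auto opts =
      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
  auto step_vec = torch::zeros({batch_size, v_dim}, opts);
  auto rho_buffer = torch::zeros({batch_size, m}, opts);
  auto y_buffer = torch::zeros({batch_size, m, v_dim}, opts);
  auto s_buffer = torch::zeros({batch_size, m, v_dim}, opts);
  auto q = torch::zeros({batch_size, v_dim}, opts);
  auto grad_q = torch::rand({batch_size, v_dim}, opts);
  auto x_0 = torch::zeros({batch_size, v_dim}, opts);
  auto grad_0 = torch::zeros({batch_size, v_dim}, opts);
  // returns {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0}
  auto out = lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q,
                             grad_q, x_0, grad_0, /*epsilon=*/0.1f, batch_size,
                             m, v_dim, /*stable_mode=*/true);
}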
103
src/curobo/curobolib/cpp/line_search_cuda.cpp
Normal file
@@ -0,0 +1,103 @@
/*
 * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <torch/extension.h>

#include <vector>

#include <c10/cuda/CUDAGuard.h>

// CUDA forward declarations

std::vector<torch::Tensor>
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
                 torch::Tensor best_iteration,
                 torch::Tensor current_iteration,
                 const torch::Tensor cost,
                 const torch::Tensor q, const int d_opt, const int cost_s1,
                 const int cost_s2, const int iteration,
                 const float delta_threshold,
                 const float relative_threshold = 0.999);

std::vector<torch::Tensor> line_search_cuda(
    // torch::Tensor m,
    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
    const torch::Tensor g_x, const torch::Tensor x_set,
    const torch::Tensor step_vec, const torch::Tensor c_0,
    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
    const int l1, const int l2, const int batchsize);

// C++ interface

// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x)                                                    \
  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x)                                                         \
  CHECK_CUDA(x);                                                               \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> line_search_call(
    // torch::Tensor m,
    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
    const torch::Tensor g_x, const torch::Tensor x_set,
    const torch::Tensor step_vec, const torch::Tensor c_0,
    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
    const int l1, const int l2, const int batchsize) {

  CHECK_INPUT(g_x);
  CHECK_INPUT(x_set);
  CHECK_INPUT(step_vec);
  CHECK_INPUT(c_0);
  CHECK_INPUT(alpha_list);
  CHECK_INPUT(c_idx);
  // CHECK_INPUT(m);
  CHECK_INPUT(best_x);
  CHECK_INPUT(best_c);
  CHECK_INPUT(best_grad);
  const at::cuda::OptionalCUDAGuard guard(best_x.device());

  // return line_search_cuda(m, g_x, step_vec, c_0, alpha_list, c_1, c_2,
  //                         strong_wolfe, l1, l2, batchsize);
  return line_search_cuda(best_x, best_c, best_grad, g_x, x_set, step_vec, c_0,
                          alpha_list, c_idx, c_1, c_2, strong_wolfe,
                          approx_wolfe, l1, l2, batchsize);
}

std::vector<torch::Tensor>
update_best_call(torch::Tensor best_cost, torch::Tensor best_q,
                 torch::Tensor best_iteration,
                 torch::Tensor current_iteration,
                 const torch::Tensor cost,
                 const torch::Tensor q, const int d_opt, const int cost_s1,
                 const int cost_s2, const int iteration,
                 const float delta_threshold,
                 const float relative_threshold = 0.999) {

  CHECK_INPUT(best_cost);
  CHECK_INPUT(best_q);
  CHECK_INPUT(cost);
  CHECK_INPUT(q);
  CHECK_INPUT(current_iteration);
  CHECK_INPUT(best_iteration);
  const at::cuda::OptionalCUDAGuard guard(cost.device());

  return update_best_cuda(best_cost, best_q, best_iteration, current_iteration,
                          cost, q, d_opt, cost_s1, cost_s2, iteration,
                          delta_threshold, relative_threshold);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("update_best", &update_best_call, "Update Best (CUDA)");
  m.def("line_search", &line_search_call, "Line search (CUDA)");
}
398
src/curobo/curobolib/cpp/line_search_kernel.cu
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cuda.h>
|
||||
#include <torch/extension.h>
|
||||
#include <vector>
|
||||
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
// #include "helper_cuda.h"
|
||||
#include "helper_math.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <cub/cub.cuh>
|
||||
#include <math.h>
|
||||
|
||||
// #include <stdio.h>
|
||||
//
|
||||
// For the CUDA runtime routines (prefixed with "cuda_")
|
||||
// #include <cuda_runtime.h>
|
||||
|
||||
// #include <cuda_fp16.h>
|
||||
// #include <helper_cuda.h>
|
||||
|
||||
#define FULL_MASK 0xffffffff
|
||||
|
||||
namespace Curobo {
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask,
|
||||
psum_t *data, scalar_t *result) {
|
||||
psum_t val = v;
|
||||
val += __shfl_down_sync(mask, val, 1);
|
||||
val += __shfl_down_sync(mask, val, 2);
|
||||
val += __shfl_down_sync(mask, val, 4);
|
||||
val += __shfl_down_sync(mask, val, 8);
|
||||
val += __shfl_down_sync(mask, val, 16);
|
||||
  // int leader = __ffs(mask) - 1; // select a leader lane
|
||||
int leader = 0;
|
||||
if (threadIdx.x % 32 == leader) {
|
||||
if (m <= 32) {
|
||||
result[0] = (scalar_t)val;
|
||||
} else {
|
||||
data[(threadIdx.x + 1) / 32] = val;
|
||||
}
|
||||
}
|
||||
if (m > 32) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int elems = (m + 31) / 32;
|
||||
assert(elems <= 32);
|
||||
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
|
||||
if (threadIdx.x < elems) { // only the first warp will do this work
|
||||
psum_t val2 = data[threadIdx.x % 32];
|
||||
int shift = 1;
|
||||
for (int i = elems - 1; i > 0; i /= 2) {
|
||||
val2 += __shfl_down_sync(mask2, val2, shift);
|
||||
shift *= 2;
|
||||
}
|
||||
      // int leader = __ffs(mask2) - 1; // select a leader lane
|
||||
int leader = 0;
|
||||
if (threadIdx.x % 32 == leader) {
|
||||
result[0] = (scalar_t)val2;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
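// Summary of the reduction above: each warp first tree-reduces its 32 lane
// values with __shfl_down_sync and its leader lane writes the partial sum into
// the shared `data` array; when m > 32, the first warp then combines those
// per-warp partials and lane 0 stores the block-wide total in result[0].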
|
||||
|
||||
// Launched with l2 threads/block and batchsize blocks
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void line_search_kernel(
|
||||
// int64_t *m_idx, // 4x1x1
|
||||
scalar_t *best_x, // 4x280
|
||||
scalar_t *best_c, // 4x1
|
||||
scalar_t *best_grad, // 4x280
|
||||
const scalar_t *g_x, // 4x6x280
|
||||
const scalar_t *x_set, // 4x6x280
|
||||
const scalar_t *step_vec, // 4x280x1
|
||||
const scalar_t *c, // 4x6x1
|
||||
const scalar_t *alpha_list, // 4x6x1
|
||||
const int64_t *c_idx, // 4x1x1
|
||||
const float c_1, const float c_2, const bool strong_wolfe,
|
||||
const bool approx_wolfe,
|
||||
const int l1, // 6
|
||||
const int l2, // 280
|
||||
const int batchsize) // 4
|
||||
{
|
||||
|
||||
int batch = blockIdx.x;
|
||||
__shared__ psum_t data[32];
|
||||
__shared__ scalar_t result[32];
|
||||
assert(l1 <= 32);
|
||||
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
|
||||
|
||||
if (threadIdx.x >= l2) {
|
||||
return;
|
||||
}
|
||||
|
||||
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
|
||||
|
||||
// g_step = g0 @ step_vec_T
|
||||
// g_x @ step_vec_T
|
||||
for (int i = 0; i < l1; i++) {
|
||||
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
|
||||
&data[0], &result[i]);
|
||||
}
|
||||
|
||||
__shared__ scalar_t step_success[32];
|
||||
__shared__ scalar_t step_success_w1[32];
|
||||
assert(blockDim.x >= l1);
|
||||
bool wolfe_1 = false;
|
||||
bool wolfe = false;
|
||||
bool condition = threadIdx.x < l1;
|
||||
if (condition) {
|
||||
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
|
||||
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
|
||||
|
||||
// condition 1:
|
||||
wolfe_1 = c[batch * l1 + threadIdx.x] <=
|
||||
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
|
||||
|
||||
// condition 2:
|
||||
bool wolfe_2;
|
||||
if (strong_wolfe) {
|
||||
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
|
||||
} else {
|
||||
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
|
||||
}
|
||||
|
||||
wolfe = wolfe_1 & wolfe_2;
|
||||
|
||||
step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
|
||||
step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
|
||||
}
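  // What the flags above encode: result[0] is the dot product of the gradient
  // at candidate 0 (taken as the current iterate) with the step direction, and
  // result[i] is the same dot product at candidate step alpha_i. wolfe_1 is
  // therefore the Armijo sufficient-decrease test
  //   c_i <= c_0 + c_1 * alpha_i * (g_0 . d),
  // and wolfe_2 is the curvature condition, in its strong form when
  // strong_wolfe is set.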
|
||||
|
||||
__syncthreads();
|
||||
|
||||
__shared__ int idx_shared;
|
||||
if (threadIdx.x == 0) {
|
||||
int m_id = 0;
|
||||
int m1_id = 0;
|
||||
scalar_t max1 = step_success[0];
|
||||
scalar_t max2 = step_success_w1[0];
|
||||
for (int i = 1; i < l1; i++) {
|
||||
if (max1 < step_success[i]) {
|
||||
max1 = step_success[i];
|
||||
m_id = i;
|
||||
}
|
||||
if (max2 < step_success_w1[i]) {
|
||||
max2 = step_success_w1[i];
|
||||
m1_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (!approx_wolfe) {
|
||||
|
||||
// m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
|
||||
if (m_id == 0) {
|
||||
m_id = m1_id;
|
||||
}
|
||||
// m_idx[m_idx == 0] = 1
|
||||
if (m_id == 0) {
|
||||
m_id = 1;
|
||||
}
|
||||
}
|
||||
idx_shared = m_id + c_idx[batch];
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// write outputs using the computed index.
|
||||
// one index per batch is computed
|
||||
////////////////////////////////////
|
||||
// l2 is d_opt, l1 is line_search n.
|
||||
// idx_shared contains index in l1
|
||||
//
|
||||
__syncthreads();
|
||||
if (threadIdx.x < l2) {
|
||||
if (threadIdx.x == 0) {
|
||||
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
|
||||
}
|
||||
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
|
||||
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
best_c[batch] = c[idx_shared];
|
||||
}
|
||||
}
|
||||
|
||||
// Launched with l2 threads/block and #blocks = batchsize
|
||||
template <typename scalar_t, typename psum_t>
|
||||
__global__ void line_search_kernel_mask(
|
||||
// int64_t *m_idx, // 4x1x1
|
||||
scalar_t *best_x, // 4x280
|
||||
scalar_t *best_c, // 4x1
|
||||
scalar_t *best_grad, // 4x280
|
||||
const scalar_t *g_x, // 4x6x280
|
||||
const scalar_t *x_set, // 4x6x280
|
||||
const scalar_t *step_vec, // 4x280x1
|
||||
const scalar_t *c, // 4x6x1
|
||||
const scalar_t *alpha_list, // 4x6x1
|
||||
const int64_t *c_idx, // 4x1x1
|
||||
const float c_1, const float c_2, const bool strong_wolfe,
|
||||
const bool approx_wolfe,
|
||||
const int l1, // 6
|
||||
const int l2, // 280
|
||||
const int batchsize) // 4
|
||||
{
|
||||
|
||||
int batch = blockIdx.x;
|
||||
__shared__ psum_t data[32];
|
||||
__shared__ scalar_t result[32];
|
||||
assert(l1 <= 32);
|
||||
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
|
||||
|
||||
if (threadIdx.x >= l2) {
|
||||
return;
|
||||
}
|
||||
|
||||
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
|
||||
|
||||
// g_step = g0 @ step_vec_T
|
||||
// g_x @ step_vec_T
|
||||
for (int i = 0; i < l1; i++) {
|
||||
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
|
||||
&data[0], &result[i]);
|
||||
}
|
||||
|
||||
// __shared__ scalar_t step_success[32];
|
||||
// __shared__ scalar_t step_success_w1[32];
|
||||
assert(blockDim.x >= l1);
|
||||
bool wolfe_1 = false;
|
||||
bool wolfe = false;
|
||||
bool condition = threadIdx.x < l1;
|
||||
if (condition) {
|
||||
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
|
||||
|
||||
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
|
||||
|
||||
// condition 1:
|
||||
wolfe_1 = c[batch * l1 + threadIdx.x] <=
|
||||
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
|
||||
|
||||
// condition 2:
|
||||
bool wolfe_2;
|
||||
if (strong_wolfe) {
|
||||
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
|
||||
} else {
|
||||
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
|
||||
}
|
||||
|
||||
// wolfe = torch.logical_and(wolfe_1, wolfe_2)
|
||||
wolfe = wolfe_1 & wolfe_2;
|
||||
|
||||
// // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1)
|
||||
// // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1)
|
||||
// step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
|
||||
// step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
|
||||
}
|
||||
unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition);
|
||||
unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition);
|
||||
|
||||
  // get the index of the last occurrence of true
|
||||
unsigned msk1_brev = __brev(msk1);
|
||||
unsigned msk_brev = __brev(msk);
|
||||
|
||||
  int id1 = 32 - __ffs(msk1_brev); // position of least significant bit set to 1
  int id = 32 - __ffs(msk_brev);   // position of least significant bit set to 1
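  // Reversing the ballot mask with __brev and taking __ffs of the result
  // locates the highest set lane: id/id1 are the largest candidate indices
  // that satisfy the full Wolfe and the Armijo-only conditions respectively,
  // or 32 when no candidate does.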
|
||||
|
||||
__syncthreads();
|
||||
|
||||
__shared__ int idx_shared;
|
||||
if (threadIdx.x == 0) {
|
||||
if (!approx_wolfe) {
|
||||
if (id == 32) { // msk is zero
|
||||
id = id1;
|
||||
}
|
||||
if (id == 0) { // bit 0 is set
|
||||
id = id1;
|
||||
}
|
||||
if (id == 32) { // msk is zero
|
||||
id = 1;
|
||||
}
|
||||
if (id == 0) {
|
||||
id = 1;
|
||||
}
|
||||
} else {
|
||||
if (id == 32) { // msk is zero
|
||||
id = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// // _, m_idx = torch.max(step_success, dim=-2)
|
||||
// // _, m1_idx = torch.max(step_success_w1, dim=-2)
|
||||
// int m_id = 0;
|
||||
// int m1_id = 0;
|
||||
// scalar_t max1 = step_success[0];
|
||||
// scalar_t max2 = step_success_w1[0];
|
||||
// for (int i=1; i<l1; i++) {
|
||||
// if (max1<step_success[i]) {
|
||||
// max1 = step_success[i];
|
||||
// m_id = i;
|
||||
// }
|
||||
// if (max2<step_success_w1[i]) {
|
||||
// max2 = step_success_w1[i];
|
||||
// m1_id = i;
|
||||
// }
|
||||
// }
|
||||
|
||||
// // m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
|
||||
// if (m_id == 0) {
|
||||
// m_id = m1_id;
|
||||
// }
|
||||
|
||||
// // m_idx[m_idx == 0] = 1
|
||||
// if (m_id == 0) {
|
||||
// m_id = 1;
|
||||
// }
|
||||
|
||||
// if (id != m_id) {
|
||||
// printf("id=%d, m_id=%d\n", id, m_id);
|
||||
// printf("msk1=%x, msk=%x, raw id1=%d, raw id=%d\n", msk1, msk,
|
||||
// 32-__ffs(msk1_brev), 32-__ffs(msk_brev));
|
||||
// }
|
||||
|
||||
// m_idx[batch] = m_id;
|
||||
// m_idx[batch] = id;
|
||||
idx_shared = id + c_idx[batch];
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// write outputs using the computed index.
|
||||
// one index per batch is computed
|
||||
////////////////////////////////////
|
||||
__syncthreads();
|
||||
if (threadIdx.x < l2) {
|
||||
if (threadIdx.x == 0) {
|
||||
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
|
||||
}
|
||||
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
|
||||
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
best_c[batch] = c[idx_shared];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Optimization
|
||||
} // namespace Curobo
|
||||
std::vector<torch::Tensor> line_search_cuda(
|
||||
// torch::Tensor m_idx,
|
||||
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
|
||||
const torch::Tensor g_x, const torch::Tensor x_set,
|
||||
const torch::Tensor step_vec, const torch::Tensor c_0,
|
||||
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
|
||||
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
|
||||
const int l1, const int l2, const int batchsize) {
|
||||
using namespace Curobo::Optimization;
|
||||
assert(l2 <= 1024);
|
||||
|
||||
// multiple of 32
|
||||
const int threadsPerBlock = 32 * ((l2 + 31) / 32); // l2;
|
||||
const int blocksPerGrid = batchsize;
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
g_x.scalar_type(), "line_search_cu", ([&] {
|
||||
line_search_kernel_mask<scalar_t, scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
// m_idx.data_ptr<int>(),
|
||||
best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
|
||||
best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
|
||||
x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
|
||||
c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
|
||||
c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
|
||||
l1, l2, batchsize);
|
||||
}));
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {best_x, best_c, best_grad};
|
||||
}
|
||||
790
src/curobo/curobolib/cpp/pose_distance_kernel.cu
Normal file
@@ -0,0 +1,790 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "helper_math.h"
|
||||
#include <cmath>
|
||||
#include <cuda_fp16.h>
|
||||
#include <vector>
|
||||
|
||||
#define SINGLE_GOAL 0
|
||||
#define BATCH_GOAL 1
|
||||
#define GOALSET 2
|
||||
#define BATCH_GOALSET 3
|
||||
|
||||
namespace Curobo
|
||||
{
|
||||
namespace Pose
|
||||
{
|
||||
__device__ __forceinline__ void
|
||||
transform_vec_quat(const float4 q, // x,y,z, qw, qx,qy,qz
|
||||
const float *d_vec_weight,
|
||||
float *C) {
|
||||
// do dot product:
|
||||
// new_p = q * p * q_inv + obs_p
|
||||
|
||||
if (false || q.x != 0 || q.y != 0 || q.z != 0) {
|
||||
|
||||
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
|
||||
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
|
||||
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
|
||||
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
|
||||
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
|
||||
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
|
||||
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
|
||||
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
|
||||
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
|
||||
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
|
||||
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
|
||||
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
|
||||
|
||||
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
|
||||
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
|
||||
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
|
||||
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
|
||||
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
|
||||
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
|
||||
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
|
||||
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
|
||||
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
|
||||
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
|
||||
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
|
||||
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
|
||||
  } else {
|
||||
C[0] = d_vec_weight[0];
|
||||
C[1] = d_vec_weight[1];
|
||||
C[2] = d_vec_weight[2];
|
||||
C[3] = d_vec_weight[3];
|
||||
C[4] = d_vec_weight[4];
|
||||
C[5] = d_vec_weight[5];
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
inv_transform_vec_quat(const float4 q_in, // x,y,z, qw, qx,qy,qz
|
||||
const float *d_vec_weight,
|
||||
float *C) {
|
||||
// do dot product:
|
||||
// new_p = q * p * q_inv + obs_p
|
||||
|
||||
if (q_in.x != 0 || q_in.y != 0 || q_in.z != 0) {
|
||||
float4 q = make_float4(-q_in.x, -q_in.y, -q_in.z, q_in.w);
|
||||
|
||||
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
|
||||
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
|
||||
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
|
||||
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
|
||||
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
|
||||
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
|
||||
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
|
||||
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
|
||||
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
|
||||
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
|
||||
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
|
||||
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
|
||||
|
||||
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
|
||||
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
|
||||
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
|
||||
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
|
||||
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
|
||||
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
|
||||
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
|
||||
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
|
||||
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
|
||||
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
|
||||
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
|
||||
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
|
||||
  } else {
|
||||
C[0] = d_vec_weight[0];
|
||||
C[1] = d_vec_weight[1];
|
||||
C[2] = d_vec_weight[2];
|
||||
C[3] = d_vec_weight[3];
|
||||
C[4] = d_vec_weight[4];
|
||||
C[5] = d_vec_weight[5];
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
__device__ __forceinline__ void
|
||||
compute_quat_distance(float *result, const float4 a, const float4 b) {
|
||||
// We compute distance with the conjugate of b
|
||||
float r_w = (a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z);
|
||||
if (r_w < 0.0) {
|
||||
r_w = 1.0;
|
||||
} else {
|
||||
r_w = -1.0;
|
||||
}
|
||||
|
||||
result[0] = r_w * (-1 * a.w * b.x + b.w * a.x - a.y * b.z + b.y * a.z);
|
||||
result[1] = r_w * (-1 * a.w * b.y + b.w * a.y - a.z * b.x + b.z * a.x);
|
||||
result[2] = r_w * (-1 * a.w * b.z + b.w * a.z - a.x * b.y + b.x * a.y);
|
||||
}
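// The vector written above is the imaginary part of a * conj(b): it vanishes
// when the two quaternions represent the same rotation, and the sign selected
// through r_w keeps the error on the shorter of the two equivalent rotations.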
|
||||
|
||||
__device__ __forceinline__ void
|
||||
compute_distance(float *distance_vec, float &distance, float &position_distance,
|
||||
float &rotation_distance, const float3 current_position,
|
||||
const float3 goal_position, const float4 current_quat,
|
||||
const float4 goal_quat, const float *vec_weight,
|
||||
const float *vec_convergence, const float position_weight,
|
||||
const float rotation_weight) {
|
||||
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
|
||||
// compute position distance
|
||||
*(float3 *)&distance_vec[0] = current_position - goal_position;
|
||||
// distance_vec[0] = goal_position
|
||||
position_distance = 0;
|
||||
rotation_distance = 0;
|
||||
// scale by vec weight and position weight:
|
||||
#pragma unroll 3
|
||||
for (int i = 0; i < 3; i++) {
|
||||
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
|
||||
position_distance += distance_vec[i] * distance_vec[i];
|
||||
}
|
||||
#pragma unroll 3
|
||||
for (int i = 3; i < 6; i++) {
|
||||
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
|
||||
rotation_distance += distance_vec[i] * distance_vec[i];
|
||||
}
|
||||
|
||||
distance = 0;
|
||||
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
|
||||
rotation_distance = sqrtf(rotation_distance);
|
||||
distance += rotation_weight * rotation_distance;
|
||||
}
|
||||
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
|
||||
position_distance = sqrtf(position_distance);
|
||||
distance += position_weight * position_distance;
|
||||
}
|
||||
}
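// compute_distance therefore returns the weighted sum
//   d = rotation_weight * ||e_rot|| + position_weight * ||e_pos||,
// where each norm is only added once its squared value exceeds the matching
// entry of vec_convergence, so terms that have already converged contribute
// zero cost.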
|
||||
|
||||
__device__ __forceinline__ void compute_metric_distance(
|
||||
float *distance_vec, float &distance, float &position_distance,
|
||||
float &rotation_distance, const float3 current_position,
|
||||
const float3 goal_position, const float4 current_quat,
|
||||
const float4 goal_quat, const float *vec_weight,
|
||||
const float *vec_convergence, const float position_weight,
|
||||
const float p_alpha, const float rotation_weight, const float r_alpha) {
|
||||
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
|
||||
// compute position distance
|
||||
*(float3 *)&distance_vec[0] = current_position - goal_position;
|
||||
// distance_vec[0] = goal_position
|
||||
position_distance = 0;
|
||||
rotation_distance = 0;
|
||||
// scale by vec weight and position weight:
|
||||
#pragma unroll 3
|
||||
for (int i = 0; i < 3; i++) {
|
||||
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
|
||||
// position_distance += powf(distance_vec[i],2);
|
||||
position_distance += distance_vec[i] * distance_vec[i];
|
||||
}
|
||||
#pragma unroll 3
|
||||
for (int i = 3; i < 6; i++) {
|
||||
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
|
||||
// rotation_distance += powf(distance_vec[i],2);
|
||||
rotation_distance += distance_vec[i] * distance_vec[i];
|
||||
}
|
||||
|
||||
distance = 0;
|
||||
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
|
||||
rotation_distance = sqrtf(rotation_distance);
|
||||
distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance));
|
||||
}
|
||||
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
|
||||
position_distance = sqrtf(position_distance);
|
||||
distance += position_weight * log2f(coshf(p_alpha * position_distance));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void
|
||||
backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3]
|
||||
scalar_t *out_grad_q, // [b,4]
|
||||
const scalar_t *grad_distance, // [b,1]
|
||||
const scalar_t *grad_p_distance, // [b,1]
|
||||
const scalar_t *grad_q_distance, // [b,1]
|
||||
const scalar_t *pose_weight, // [2]
|
||||
const scalar_t *grad_p_vec, // [b,3]
|
||||
const scalar_t *grad_q_vec, // [b,4]
|
||||
const int batch_size) {
|
||||
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (batch_idx >= batch_size) {
|
||||
return;
|
||||
}
|
||||
// read data
|
||||
const float g_distance = grad_distance[batch_idx];
|
||||
const float2 p_weight = *(float2 *)&pose_weight[0];
|
||||
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
|
||||
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
|
||||
const float g_p_distance = grad_p_distance[batch_idx];
|
||||
const float g_q_distance = grad_q_distance[batch_idx];
|
||||
|
||||
// compute position gradient
|
||||
float3 g_p =
|
||||
(g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3
|
||||
float3 g_q =
|
||||
(g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3
|
||||
|
||||
// write out
|
||||
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
|
||||
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3]
|
||||
scalar_t *out_grad_q, // [b,4]
|
||||
const scalar_t *grad_distance, // [b,1]
|
||||
const scalar_t *pose_weight, // [2]
|
||||
const scalar_t *grad_p_vec, // [b,3]
|
||||
const scalar_t *grad_q_vec, // [b,4]
|
||||
const int batch_size) {
|
||||
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (batch_idx >= batch_size) {
|
||||
return;
|
||||
}
|
||||
// read data
|
||||
const float g_distance = grad_distance[batch_idx];
|
||||
const float2 p_weight = *(float2 *)&pose_weight[0];
|
||||
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
|
||||
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
|
||||
|
||||
// compute position gradient
|
||||
float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3
|
||||
float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3
|
||||
|
||||
// write out
|
||||
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
|
||||
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool write_distance>
|
||||
__global__ void goalset_pose_distance_kernel(
|
||||
scalar_t *out_distance, scalar_t *out_position_distance,
|
||||
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
|
||||
int32_t *out_gidx, const scalar_t *current_position,
|
||||
const scalar_t *goal_position, const scalar_t *current_quat,
|
||||
const scalar_t *goal_quat, const scalar_t *vec_weight,
|
||||
const scalar_t *weight, const scalar_t *vec_convergence,
|
||||
const scalar_t *run_weight, const scalar_t *run_vec_weight,
|
||||
const int32_t *batch_pose_idx, const int mode, const int num_goals,
|
||||
const int batch_size, const int horizon, const bool write_grad = false) {
|
||||
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
const int batch_idx = t_idx / horizon;
|
||||
const int h_idx = t_idx - (batch_idx * horizon);
|
||||
if (batch_idx >= batch_size || h_idx >= horizon) {
|
||||
return;
|
||||
}
|
||||
|
||||
// read current pose:
|
||||
float3 position =
|
||||
*(float3 *)¤t_position[batch_idx * horizon * 3 + h_idx * 3];
|
||||
float4 quat_4 = *(float4 *)¤t_quat[batch_idx * horizon * 4 + h_idx * 4];
|
||||
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
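  // current_quat is stored as (qw, qx, qy, qz); repack into a float4 ordered
  // (x, y, z, w) so the quaternion math below can use .x/.y/.z/.w directly.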
|
||||
|
||||
// read weights:
|
||||
|
||||
float position_weight = weight[1];
|
||||
float rotation_weight = weight[0];
|
||||
if (!write_distance) {
|
||||
position_weight *= run_weight[h_idx];
|
||||
rotation_weight *= run_weight[h_idx];
|
||||
if (position_weight == 0.0 && rotation_weight == 0.0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
float3 l_goal_position;
|
||||
float4 l_goal_quat;
|
||||
float distance_vec[6]; // = {0.0};
|
||||
float pose_distance = 0.0;
|
||||
float position_distance = 0.0;
|
||||
float rotation_distance = 0.0;
|
||||
float best_distance = INFINITY;
|
||||
float best_position_distance = 0.0;
|
||||
float best_rotation_distance = 0.0;
|
||||
float best_distance_vec[6] = {0.0};
|
||||
float best_des_vec_weight[6] = {0.0};
|
||||
float d_vec_convergence[2];
|
||||
|
||||
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
|
||||
|
||||
int best_idx = -1;
|
||||
float d_vec_weight[6];
|
||||
float des_vec_weight[6] = {0.0};
|
||||
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
|
||||
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
|
||||
if (h_idx < horizon - 1) {
|
||||
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
|
||||
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
|
||||
}
|
||||
|
||||
|
||||
// read offset
|
||||
int offset = batch_pose_idx[batch_idx];
|
||||
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
|
||||
offset = (offset)*num_goals;
|
||||
}
|
||||
|
||||
for (int k = 0; k < num_goals; k++) {
|
||||
|
||||
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
|
||||
float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4];
|
||||
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
|
||||
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
|
||||
|
||||
compute_distance(&distance_vec[0], pose_distance, position_distance,
|
||||
rotation_distance, position, l_goal_position, quat,
|
||||
l_goal_quat,
|
||||
&des_vec_weight[0], //&l_vec_weight[0],
|
||||
&d_vec_convergence[0], //&l_vec_convergence[0],
|
||||
position_weight, rotation_weight);
|
||||
if (pose_distance <= best_distance) {
|
||||
best_idx = k;
|
||||
best_distance = pose_distance;
|
||||
best_position_distance = position_distance;
|
||||
best_rotation_distance = rotation_distance;
|
||||
if (write_grad) {
|
||||
//inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
|
||||
#pragma unroll 6
|
||||
for (int i = 0; i < 6; i++) {
|
||||
best_distance_vec[i] = distance_vec[i];
|
||||
best_des_vec_weight[i] = des_vec_weight[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// write out:
|
||||
|
||||
// write out pose distance:
|
||||
out_distance[batch_idx * horizon + h_idx] = best_distance;
|
||||
if (write_distance) {
|
||||
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
|
||||
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
|
||||
}
|
||||
out_gidx[batch_idx * horizon + h_idx] = best_idx;
|
||||
|
||||
if (write_grad) {
|
||||
if (write_distance) {
|
||||
position_weight = 1;
|
||||
rotation_weight = 1;
|
||||
}
|
||||
if (best_position_distance > 0) {
|
||||
best_position_distance = (position_weight / best_position_distance);
|
||||
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
|
||||
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
|
||||
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
|
||||
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
|
||||
|
||||
} else {
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
|
||||
}
|
||||
|
||||
if (best_rotation_distance > 0) {
|
||||
best_rotation_distance = rotation_weight / best_rotation_distance;
|
||||
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
|
||||
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
|
||||
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
|
||||
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
|
||||
} else {
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool write_distance>
|
||||
__global__ void goalset_pose_distance_metric_kernel(
|
||||
scalar_t *out_distance, scalar_t *out_position_distance,
|
||||
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
|
||||
int32_t *out_gidx, const scalar_t *current_position,
|
||||
const scalar_t *goal_position, const scalar_t *current_quat,
|
||||
const scalar_t *goal_quat, const scalar_t *vec_weight,
|
||||
const scalar_t *weight, const scalar_t *vec_convergence,
|
||||
const scalar_t *run_weight, const scalar_t *run_vec_weight,
|
||||
const int32_t *batch_pose_idx, const int mode, const int num_goals,
|
||||
const int batch_size, const int horizon, const bool write_grad = false) {
|
||||
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
const int batch_idx = t_idx / horizon;
|
||||
const int h_idx = t_idx - (batch_idx * horizon);
|
||||
if (batch_idx >= batch_size || h_idx >= horizon) {
|
||||
return;
|
||||
}
|
||||
|
||||
// read current pose:
|
||||
float3 position =
|
||||
*(float3 *)¤t_position[batch_idx * horizon * 3 + h_idx * 3];
|
||||
float4 quat_4 = *(float4 *)¤t_quat[batch_idx * horizon * 4 + h_idx * 4];
|
||||
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
|
||||
|
||||
// read weights:
|
||||
|
||||
float position_weight = weight[1];
|
||||
float p_w_alpha = weight[3];
|
||||
float r_w_alpha = weight[2];
|
||||
float rotation_weight = weight[0];
|
||||
if (!write_distance) {
|
||||
position_weight *= run_weight[h_idx];
|
||||
rotation_weight *= run_weight[h_idx];
|
||||
p_w_alpha *= run_weight[h_idx];
|
||||
r_w_alpha *= run_weight[h_idx];
|
||||
if (position_weight == 0.0 && rotation_weight == 0.0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
float d_vec_convergence[2];
|
||||
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
|
||||
|
||||
float d_vec_weight[6];
|
||||
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
|
||||
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
|
||||
if (h_idx < horizon - 1) {
|
||||
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
|
||||
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
|
||||
}
|
||||
|
||||
float des_vec_weight[6] = {0.0};
|
||||
float3 l_goal_position;
|
||||
float4 l_goal_quat;
|
||||
float distance_vec[6]; // = {0.0};
|
||||
float pose_distance = 0.0;
|
||||
float position_distance = 0.0;
|
||||
float rotation_distance = 0.0;
|
||||
float best_distance = INFINITY;
|
||||
float best_position_distance = 0.0;
|
||||
float best_rotation_distance = 0.0;
|
||||
float best_distance_vec[6] = {0.0};
|
||||
float best_des_vec_weight[6] = {0.0};
|
||||
  int best_idx = -1;
|
||||
int offset = batch_pose_idx[batch_idx];
|
||||
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
|
||||
offset = (offset)*num_goals;
|
||||
}
|
||||
|
||||
for (int k = 0; k < num_goals; k++) {
|
||||
|
||||
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
|
||||
float4 gq4 =
|
||||
*(float4 *)&goal_quat[(offset + k) * 4]; // reading qw, qx, qy, qz
|
||||
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
|
||||
|
||||
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
|
||||
compute_metric_distance(
|
||||
&distance_vec[0], pose_distance, position_distance, rotation_distance,
|
||||
position, l_goal_position, quat, l_goal_quat,
|
||||
&des_vec_weight[0], //&l_vec_weight[0],
|
||||
&d_vec_convergence[0], //&l_vec_convergence[0],
|
||||
position_weight, p_w_alpha, rotation_weight, r_w_alpha);
|
||||
if (pose_distance <= best_distance) {
|
||||
best_idx = k;
|
||||
best_distance = pose_distance;
|
||||
best_position_distance = position_distance;
|
||||
best_rotation_distance = rotation_distance;
|
||||
if (write_grad) {
|
||||
// inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
|
||||
|
||||
#pragma unroll 6
|
||||
for (int i = 0; i < 6; i++) {
|
||||
best_distance_vec[i] = distance_vec[i];
|
||||
best_des_vec_weight[i] = des_vec_weight[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add scaling metric:
|
||||
// best_distance = log2f(acoshf(best_distance));
|
||||
// write out:
|
||||
|
||||
// write out pose distance:
|
||||
out_distance[batch_idx * horizon + h_idx] = best_distance;
|
||||
if (write_distance) {
|
||||
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
|
||||
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
|
||||
}
|
||||
out_gidx[batch_idx * horizon + h_idx] = best_idx;
|
||||
|
||||
if (write_grad) {
|
||||
// write gradient
|
||||
// compute scalar term:
|
||||
// -w * (d_vec)/ (length * length(negative +1) * acos(length)
|
||||
// -w * (d_vec) * sinh(length) / (length * cosh(length))
|
||||
// compute extra term:
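    // For the log-cosh metric w * log(cosh(alpha * d)) with d = ||e||, the
    // gradient with respect to the error vector e is
    //   w * alpha * tanh(alpha * d) * e / d
    //     = (alpha * w * sinh(alpha * d)) / (d * cosh(alpha * d)) * e,
    // which is the per-axis scaling applied below (any constant from the log
    // base is treated as absorbed into the weight).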
|
||||
|
||||
if (write_distance) {
|
||||
position_weight = 1.0;
|
||||
rotation_weight = 1.0;
|
||||
}
|
||||
|
||||
if (best_position_distance > 0) {
|
||||
best_position_distance =
|
||||
(p_w_alpha * position_weight *
|
||||
sinhf(p_w_alpha * best_position_distance)) /
|
||||
(best_position_distance * coshf(p_w_alpha * best_position_distance));
|
||||
|
||||
// best_position_distance = (position_weight/best_position_distance);
|
||||
      // compute scalar gradient
|
||||
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
|
||||
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
|
||||
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
|
||||
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
|
||||
|
||||
} else {
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
|
||||
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
|
||||
}
|
||||
|
||||
if (best_rotation_distance > 0) {
|
||||
best_rotation_distance =
|
||||
(r_w_alpha * rotation_weight *
|
||||
sinhf(r_w_alpha * best_rotation_distance)) /
|
||||
(best_rotation_distance * coshf(r_w_alpha * best_rotation_distance));
|
||||
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
|
||||
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
|
||||
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
|
||||
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
|
||||
} else {
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
|
||||
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
|
||||
torch::Tensor out_rotation_distance,
|
||||
torch::Tensor distance_p_vector, // batch size, 3
|
||||
torch::Tensor distance_q_vector, // batch size, 4
|
||||
torch::Tensor out_gidx,
|
||||
const torch::Tensor current_position, // batch_size, 3
|
||||
const torch::Tensor goal_position, // n_boxes, 3
|
||||
const torch::Tensor current_quat, const torch::Tensor goal_quat,
|
||||
const torch::Tensor vec_weight, // n_boxes, 4, 4
|
||||
const torch::Tensor weight, const torch::Tensor vec_convergence,
|
||||
const torch::Tensor run_weight,
|
||||
const torch::Tensor run_vec_weight,
|
||||
const torch::Tensor batch_pose_idx, // batch_size, 1
|
||||
const int batch_size, const int horizon, const int mode,
|
||||
const int num_goals = 1, const bool compute_grad = false,
|
||||
const bool write_distance = true, const bool use_metric = false) {
|
||||
using namespace Curobo::Pose;
|
||||
// we compute the warp threads based on number of boxes:
|
||||
assert(batch_pose_idx.size(0) == batch_size);
|
||||
// TODO: verify this math
|
||||
// const int batch_size = out_distance.size(0);
|
||||
assert(run_weight.size(-1) == horizon);
|
||||
const int bh = batch_size * horizon;
|
||||
int threadsPerBlock = bh;
|
||||
if (bh > 128) {
|
||||
threadsPerBlock = 128;
|
||||
}
|
||||
|
||||
// we fit warp thread spheres in a threadsPerBlock
|
||||
|
||||
int blocksPerGrid = (bh + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
if (use_metric) {
|
||||
if (write_distance)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
current_position.scalar_type(), "batch_pose_distance", ([&] {
|
||||
goalset_pose_distance_metric_kernel
|
||||
<scalar_t, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_position_distance.data_ptr<scalar_t>(),
|
||||
out_rotation_distance.data_ptr<scalar_t>(),
|
||||
distance_p_vector.data_ptr<scalar_t>(),
|
||||
distance_q_vector.data_ptr<scalar_t>(),
|
||||
out_gidx.data_ptr<int32_t>(),
|
||||
current_position.data_ptr<scalar_t>(),
|
||||
goal_position.data_ptr<scalar_t>(),
|
||||
current_quat.data_ptr<scalar_t>(),
|
||||
goal_quat.data_ptr<scalar_t>(),
|
||||
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
|
||||
vec_convergence.data_ptr<scalar_t>(),
|
||||
run_weight.data_ptr<scalar_t>(),
|
||||
run_vec_weight.data_ptr<scalar_t>(),
|
||||
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
|
||||
batch_size, horizon, compute_grad);
|
||||
}));
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
current_position.scalar_type(), "batch_pose_distance", ([&] {
|
||||
goalset_pose_distance_metric_kernel
|
||||
// goalset_pose_distance_kernel
|
||||
<scalar_t, false><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_position_distance.data_ptr<scalar_t>(),
|
||||
out_rotation_distance.data_ptr<scalar_t>(),
|
||||
distance_p_vector.data_ptr<scalar_t>(),
|
||||
distance_q_vector.data_ptr<scalar_t>(),
|
||||
out_gidx.data_ptr<int32_t>(),
|
||||
current_position.data_ptr<scalar_t>(),
|
||||
goal_position.data_ptr<scalar_t>(),
|
||||
current_quat.data_ptr<scalar_t>(),
|
||||
goal_quat.data_ptr<scalar_t>(),
|
||||
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
|
||||
vec_convergence.data_ptr<scalar_t>(),
|
||||
run_weight.data_ptr<scalar_t>(),
|
||||
run_vec_weight.data_ptr<scalar_t>(),
|
||||
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
|
||||
batch_size, horizon, compute_grad);
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
if(write_distance)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
current_position.scalar_type(), "batch_pose_distance", ([&] {
|
||||
// goalset_pose_distance_metric_kernel
|
||||
goalset_pose_distance_kernel<scalar_t, true>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_position_distance.data_ptr<scalar_t>(),
|
||||
out_rotation_distance.data_ptr<scalar_t>(),
|
||||
distance_p_vector.data_ptr<scalar_t>(),
|
||||
distance_q_vector.data_ptr<scalar_t>(),
|
||||
out_gidx.data_ptr<int32_t>(),
|
||||
current_position.data_ptr<scalar_t>(),
|
||||
goal_position.data_ptr<scalar_t>(),
|
||||
current_quat.data_ptr<scalar_t>(),
|
||||
goal_quat.data_ptr<scalar_t>(),
|
||||
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
|
||||
vec_convergence.data_ptr<scalar_t>(),
|
||||
run_weight.data_ptr<scalar_t>(),
|
||||
run_vec_weight.data_ptr<scalar_t>(),
|
||||
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
|
||||
batch_size, horizon, compute_grad);
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
current_position.scalar_type(), "batch_pose_distance", ([&] {
|
||||
// goalset_pose_distance_metric_kernel
|
||||
goalset_pose_distance_kernel<scalar_t, false>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_position_distance.data_ptr<scalar_t>(),
|
||||
out_rotation_distance.data_ptr<scalar_t>(),
|
||||
distance_p_vector.data_ptr<scalar_t>(),
|
||||
distance_q_vector.data_ptr<scalar_t>(),
|
||||
out_gidx.data_ptr<int32_t>(),
|
||||
current_position.data_ptr<scalar_t>(),
|
||||
goal_position.data_ptr<scalar_t>(),
|
||||
current_quat.data_ptr<scalar_t>(),
|
||||
goal_quat.data_ptr<scalar_t>(),
|
||||
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
|
||||
vec_convergence.data_ptr<scalar_t>(),
|
||||
run_weight.data_ptr<scalar_t>(),
|
||||
run_vec_weight.data_ptr<scalar_t>(),
|
||||
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
|
||||
batch_size, horizon, compute_grad);
|
||||
}));
|
||||
}
|
||||
}
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {out_distance, out_position_distance, out_rotation_distance,
|
||||
distance_p_vector, distance_q_vector, out_gidx};
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
|
||||
const torch::Tensor grad_distance, // batch_size, 3
|
||||
const torch::Tensor grad_p_distance, // n_boxes, 3
|
||||
const torch::Tensor grad_q_distance,
|
||||
const torch::Tensor pose_weight,
|
||||
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
|
||||
const torch::Tensor grad_q_vec, const int batch_size,
|
||||
const bool use_distance = false) {
|
||||
|
||||
// we compute the warp threads based on number of boxes:
|
||||
|
||||
// TODO: verify this math
|
||||
// const int batch_size = grad_distance.size(0);
|
||||
using namespace Curobo::Pose;
|
||||
|
||||
int threadsPerBlock = batch_size;
|
||||
if (batch_size > 128) {
|
||||
threadsPerBlock = 128;
|
||||
}
|
||||
|
||||
// we fit warp thread spheres in a threadsPerBlock
|
||||
|
||||
int blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (use_distance) {
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_distance.scalar_type(), "backward_pose_distance", ([&] {
|
||||
backward_pose_distance_kernel<scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_grad_p.data_ptr<scalar_t>(),
|
||||
out_grad_q.data_ptr<scalar_t>(),
|
||||
grad_distance.data_ptr<scalar_t>(),
|
||||
grad_p_distance.data_ptr<scalar_t>(),
|
||||
grad_q_distance.data_ptr<scalar_t>(),
|
||||
pose_weight.data_ptr<scalar_t>(),
|
||||
grad_p_vec.data_ptr<scalar_t>(),
|
||||
grad_q_vec.data_ptr<scalar_t>(), batch_size);
|
||||
}));
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_distance.scalar_type(), "backward_pose", ([&] {
|
||||
backward_pose_kernel<scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
out_grad_p.data_ptr<scalar_t>(),
|
||||
out_grad_q.data_ptr<scalar_t>(),
|
||||
grad_distance.data_ptr<scalar_t>(),
|
||||
pose_weight.data_ptr<scalar_t>(),
|
||||
grad_p_vec.data_ptr<scalar_t>(),
|
||||
grad_q_vec.data_ptr<scalar_t>(), batch_size);
|
||||
}));
|
||||
}
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {out_grad_p, out_grad_q};
|
||||
}
|
||||
643
src/curobo/curobolib/cpp/self_collision_kernel.cu
Normal file
@@ -0,0 +1,643 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "helper_math.h"
|
||||
#include <cmath>
|
||||
#include <cuda_fp16.h>
|
||||
#include <vector>
|
||||
|
||||
#define SINGLE_GOAL 0
|
||||
#define BATCH_GOAL 1
|
||||
#define GOALSET 2
|
||||
#define BATCH_GOALSET 3
|
||||
|
||||
namespace Curobo {
|
||||
|
||||
namespace Geometry {
|
||||
|
||||
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
|
||||
if (var < 0)
|
||||
return 0;
|
||||
else
|
||||
return var;
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void self_collision_distance_kernel(
|
||||
scalar_t *out_distance, // batch x 1
|
||||
scalar_t *out_vec, // batch x nspheres x 4
|
||||
const scalar_t *robot_spheres, // batch x nspheres x 4
|
||||
const scalar_t *collision_threshold, const int batch_size,
|
||||
const int nspheres, const scalar_t *weight, const bool write_grad = false) {
|
||||
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (batch_idx >= batch_size) {
|
||||
return;
|
||||
}
|
||||
float r_diff, distance;
|
||||
float max_penetration = 0;
|
||||
float3 sph1, sph2, dist_vec;
|
||||
int sph1_idx = -1;
|
||||
int sph2_idx = -1;
|
||||
// iterate over spheres:
|
||||
for (int i = 0; i < nspheres; i++) {
|
||||
sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4];
|
||||
for (int j = i + 1; j < nspheres; j++) {
|
||||
r_diff = collision_threshold[i * nspheres + j];
|
||||
if (isinf(r_diff)) {
|
||||
continue;
|
||||
}
|
||||
sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4];
|
||||
// compute sphere distance:
|
||||
distance = relu(r_diff - length(sph1 - sph2));
|
||||
if (distance > max_penetration) {
|
||||
max_penetration = distance;
|
||||
sph1_idx = i;
|
||||
sph2_idx = j;
|
||||
if (write_grad) {
|
||||
dist_vec = (sph1 - sph2) / distance;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write out pose distance:
|
||||
if (max_penetration > 0) {
|
||||
out_distance[batch_idx] = weight[0] * max_penetration;
|
||||
if (write_grad) {
|
||||
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] =
|
||||
weight[0] * -1 * dist_vec;
|
||||
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] =
|
||||
weight[0] * dist_vec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
float d;
|
||||
int16_t i;
|
||||
int16_t j;
|
||||
} dist_t;
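// dist_t packs the running maximum penetration together with the offending
// sphere pair into 8 bytes, so the whole struct can be exchanged between lanes
// as a single 64-bit value via __shfl_down_sync in the reduction below.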
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// n warps per row
|
||||
// ndpt rows per warp
|
||||
///////////////////////////////////////////////////////////
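// Work decomposition: one thread block per batch element. The upper triangle
// of the nspheres x nspheres pair matrix is tiled into 32-column strips; each
// warp owns one strip spanning ndpt consecutive rows, so every thread checks a
// single column across up to ndpt sphere pairs and keeps its running maximum
// penetration in registers.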
|
||||
template <typename scalar_t>
|
||||
__global__ void self_collision_distance_kernel4(
|
||||
scalar_t *out_distance, // batch x 1
|
||||
scalar_t *out_vec, // batch x nspheres x 3
|
||||
const scalar_t *robot_spheres, // batch x nspheres x 3
|
||||
const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size,
|
||||
const int nspheres,
|
||||
const int ndpt, // number of distances to be computed per thread
|
||||
const int nwpr, // number of warps per row
|
||||
const scalar_t *weight, uint8_t *sparse_index,
|
||||
const bool write_grad = false) {
|
||||
int batch_idx = blockIdx.x;
|
||||
int warp_idx = threadIdx.x / 32;
|
||||
int i = ndpt * (warp_idx / nwpr); // starting row number for this warp
|
||||
int j = (warp_idx % nwpr) * 32; // starting column number for this warp
|
||||
|
||||
dist_t max_d = {.d = 0.0, .i = 0, .j = 0};
|
||||
__shared__ dist_t max_darr[32];
|
||||
|
||||
// Optimization: About 1/3 of the warps will have no work.
|
||||
// We compute distances only when i<j. If i is never <j in this warp, we have
|
||||
// no work
|
||||
if (i > j + 31) { // this warp has no work
|
||||
max_darr[warp_idx] = max_d;
|
||||
return;
|
||||
}
|
||||
|
||||
// load robot_spheres to shared memory
|
||||
extern __shared__ float4 __rs_shared[];
|
||||
if (threadIdx.x < nspheres) {
|
||||
float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)];
|
||||
//float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)],
|
||||
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1],
|
||||
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2],
|
||||
//robot_spheres_radius[threadIdx.x]) ;
|
||||
sph.w += offsets[threadIdx.x];
|
||||
__rs_shared[threadIdx.x] = sph;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
  //////////////////////////////////////////////////////
  // Compute distances and store the maximum per thread
  // in registers (max_d).
  // Each thread computes up to ndpt distances.
  // nwpr warps cover each row.
  //////////////////////////////////////////////////////
|
||||
// int nspheres_2 = nspheres * nspheres;
|
||||
|
||||
j = j + threadIdx.x % 32; // column number for this thread
|
||||
|
||||
float4 sph2;
|
||||
if (j < nspheres) {
|
||||
sph2 = __rs_shared[j]; // we need not load sph2 in every iteration.
|
||||
|
||||
#pragma unroll 16
|
||||
for (int k = 0; k < ndpt; k++, i++) { // increment i also here
|
||||
if (i < nspheres && j > i) {
|
||||
// check if self collision is allowed here:
|
||||
if (coll_matrix[i * nspheres + j] == 1) {
|
||||
|
||||
float4 sph1 = __rs_shared[i];
|
||||
if (sph1.w <= 0.0 || sph2.w <= 0.0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
float r_diff = sph1.w + sph2.w;
|
||||
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
|
||||
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
|
||||
(sph1.z - sph2.z) * (sph1.z - sph2.z));
|
||||
//float distance = max((float)0.0, (float)(r_diff - d));
|
||||
float distance = r_diff - d;
|
||||
// printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j,
|
||||
// distance);
|
||||
if (distance > max_d.d) {
|
||||
max_d.d = distance;
|
||||
max_d.i = i;
|
||||
max_d.j = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// max_d has the result max for this thread
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
  // Reduce the blockDim.x per-thread values down to per-warp maxima
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
// Perform warp-wide reductions
|
||||
// Optimization: Skip the reduction if all the values are zero
|
||||
unsigned zero_mask = __ballot_sync(
|
||||
0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So,
|
||||
// zero_mask should be 0 in the common case.
|
||||
if (zero_mask != 0) { // some of the values are non-zero
|
||||
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
|
||||
if (threadIdx.x < blockDim.x) {
|
||||
// dist_t max_d = dist_sh[threadIdx.x];
|
||||
for (int offset = 16; offset > 0; offset /= 2) {
|
||||
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset);
|
||||
dist_t d_temp = *(dist_t *)&nd;
|
||||
if (d_temp.d > max_d.d) {
|
||||
max_d = d_temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// thread0 in the warp has the max_d for the warp
|
||||
if (threadIdx.x % 32 == 0) {
|
||||
max_darr[warp_idx] = max_d;
|
||||
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
|
||||
// blockIdx.x, max_d);
|
||||
}
|
||||
|
||||
if (threadIdx.x < nspheres) {
|
||||
if (write_grad && sparse_index[batch_idx * nspheres + threadIdx.x] != 0) {
|
||||
|
||||
*(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] =
|
||||
make_float4(0.0);
|
||||
sparse_index[batch_idx * nspheres + threadIdx.x] = 0;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
dist_t max_d = max_darr[0];
|
||||
    // TODO: This can be parallelized.
|
||||
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
|
||||
if (max_darr[i].d > max_d.d) {
|
||||
max_d = max_darr[i];
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////
|
||||
// Write out the final results
|
||||
//////////////////////////////////////////////////////
|
||||
if (max_d.d != 0.0) {
|
||||
out_distance[batch_idx] = weight[0] * max_d.d;
|
||||
if (write_grad) {
|
||||
float3 sph1 =
|
||||
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)];
|
||||
float3 sph2 =
|
||||
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)];
|
||||
float3 dist_vec = (sph1 - sph2) / max_d.d;
|
||||
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] =
|
||||
weight[0] * -1 * dist_vec;
|
||||
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] =
|
||||
weight[0] * dist_vec;
|
||||
sparse_index[batch_idx * nspheres + max_d.i] = 1;
|
||||
sparse_index[batch_idx * nspheres + max_d.j] = 1;
|
||||
}
|
||||
} else {
|
||||
out_distance[batch_idx] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename scalar_t, int ndpt, int NBPB>
|
||||
__global__ void self_collision_distance_kernel7(
|
||||
scalar_t *out_distance, // batch x 1
|
||||
scalar_t *out_vec, // batch x nspheres x 3
|
||||
uint8_t *sparse_index,
|
||||
const scalar_t *robot_spheres, // batch x nspheres x 3
|
||||
const scalar_t *offsets, // nspheres
|
||||
const scalar_t *weight, const int16_t *locations_, const int batch_size,
|
||||
const int nspheres, const bool write_grad = false) {
|
||||
uint32_t batch_idx = blockIdx.x * NBPB;
|
||||
uint8_t nbpb = min(NBPB, batch_size - batch_idx);
|
||||
|
||||
if (nbpb == 0)
|
||||
return;
|
||||
|
||||
// Layout in shared memory:
|
||||
  // sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=3]
  // sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=3]
|
||||
// ...
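  // For example (illustrative, assuming NBPB = 4): sphere s for the l-th batch
  // handled by this block lives at __rs_shared[4 * s + l], matching the
  // __rs_shared[NBPB * threadIdx.x + l] writes below.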
|
||||
extern __shared__ float4 __rs_shared[];
|
||||
if (threadIdx.x < nspheres) { // threadIdx.x is sphere index
|
||||
#pragma unroll
|
||||
for (int l = 0; l < nbpb; l++) {
|
||||
float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)];
|
||||
|
||||
//float4 sph = make_float4(
|
||||
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)],
|
||||
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1],
|
||||
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2],
|
||||
// robot_spheres_radius[threadIdx.x]
|
||||
//);
|
||||
|
||||
sph.w += offsets[threadIdx.x];
|
||||
__rs_shared[NBPB * threadIdx.x + l] = sph;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Compute distances and store the maximum per thread
|
||||
// in registers (max_d).
|
||||
  // Each thread computes up to ndpt distances.
|
||||
//////////////////////////////////////////////////////
|
||||
dist_t max_d[NBPB] = {{.d = 0.0, .i = 0, .j = 0}};
|
||||
int16_t indices[ndpt * 2];
|
||||
  for (uint8_t i = 0; i < ndpt * 2; i++) {
    indices[i] = locations_[threadIdx.x * 2 * ndpt + i];
  }
|
||||
|
||||
#pragma unroll
|
||||
for (uint8_t k = 0; k < ndpt; k++) {
|
||||
    // Iterate over the ndpt sphere pairs assigned to this thread; a larger
    // ndpt lets each thread cover more pairs.
|
||||
int i = indices[k * 2];
|
||||
int j = indices[k * 2 + 1];
|
||||
if (i == -1 || j == -1)
|
||||
continue;
|
||||
|
||||
#pragma unroll
|
||||
for (uint16_t l = 0; l < nbpb; l++) { // iterate through nbpb batches
|
||||
float4 sph1 = __rs_shared[NBPB * i + l];
|
||||
float4 sph2 = __rs_shared[NBPB * j + l];
|
||||
if (sph1.w <= 0.0 || sph2.w <= 0.0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
float r_diff =
|
||||
sph1.w + sph2.w; // sum of two radii, radii include respective offsets
|
||||
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
|
||||
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
|
||||
(sph1.z - sph2.z) * (sph1.z - sph2.z));
|
||||
float f_diff = r_diff - d;
|
||||
if (f_diff > max_d[l].d) {
|
||||
max_d[l].d = f_diff;
|
||||
max_d[l].i = i;
|
||||
max_d[l].j = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
// max_d has the result max for this thread
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
  // Reduce the blockDim.x per-thread values down to per-warp maxima
|
||||
//////////////////////////////////////////////////////
|
||||
  // The reduction below is warp-wide (32 threads), so all self-collision
  // checks for a given batch index must be handled within a single warp.
|
||||
|
||||
__shared__ dist_t max_darr[32 * NBPB];
|
||||
|
||||
#pragma unroll
|
||||
for (uint8_t l = 0; l < nbpb; l++) {
|
||||
// Perform warp-wide reductions
|
||||
// Optimization: Skip the reduction if all the values are zero
|
||||
unsigned zero_mask = __ballot_sync(
|
||||
0xffffffff,
|
||||
max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask
|
||||
// should be 0 in the common case.
|
||||
if (zero_mask != 0) { // some of the values are non-zero
|
||||
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
|
||||
if (threadIdx.x < blockDim.x) {
|
||||
// dist_t max_d = dist_sh[threadIdx.x];
|
||||
for (int offset = 16; offset > 0; offset /= 2) {
|
||||
// the offset here is linked to ndpt?
|
||||
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset);
|
||||
dist_t d_temp = *(dist_t *)&nd;
|
||||
if (d_temp.d > max_d[l].d) {
|
||||
max_d[l] = d_temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// thread0 in the warp has the max_d for the warp
|
||||
if (threadIdx.x % 32 == 0) {
|
||||
max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l];
|
||||
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
|
||||
// blockIdx.x, max_d);
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x < nspheres) {
|
||||
for (int l = 0; l < nbpb; l++) {
|
||||
|
||||
if (write_grad &&
|
||||
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0) {
|
||||
*(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] =
|
||||
make_float4(0.0);
|
||||
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
#pragma unroll
|
||||
for (uint8_t l = 0; l < nbpb; l++) {
|
||||
dist_t max_d = max_darr[l * 32];
|
||||
      // TODO: This can be parallelized.
|
||||
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
|
||||
if (max_darr[l * 32 + i].d > max_d.d) {
|
||||
max_d = max_darr[l * 32 + i];
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////
|
||||
// Write out the final results
|
||||
//////////////////////////////////////////////////////
|
||||
if (max_d.d != 0.0) {
|
||||
out_distance[batch_idx + l] = weight[0] * max_d.d;
|
||||
if (write_grad) {
|
||||
float3 sph1 =
|
||||
*(float3 *)&robot_spheres[4 *
|
||||
((batch_idx + l) * nspheres + max_d.i)];
|
||||
float3 sph2 =
|
||||
*(float3 *)&robot_spheres[4 *
|
||||
((batch_idx + l) * nspheres + max_d.j)];
|
||||
float3 dist_vec = (sph1 - sph2) / max_d.d;
|
||||
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] =
|
||||
weight[0] * -1 * dist_vec;
|
||||
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] =
|
||||
weight[0] * dist_vec;
|
||||
sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1;
|
||||
sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1;
|
||||
}
|
||||
} else {
|
||||
out_distance[batch_idx + l] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace Geometry
|
||||
} // namespace Curobo
|
||||
|
||||
// This is the best version so far.
|
||||
// It precomputes the start addresses per thread on the cpu.
|
||||
// The rest is similar to the version above.
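// Illustrative layout of thread_locations (an assumption based on how the
// kernel indexes locations_): with ndpt = 2 and the enabled pairs
// (0,2), (0,3), (1,3), the buffer would look like
//   [0,2,  0,3,   1,3,  -1,-1]
// so thread 0 checks pairs (0,2) and (0,3) while thread 1 checks (1,3) and
// skips its padded (-1,-1) slot.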
|
||||
std::vector<torch::Tensor> self_collision_distance(
|
||||
torch::Tensor out_distance, torch::Tensor out_vec,
|
||||
torch::Tensor sparse_index,
|
||||
const torch::Tensor robot_spheres, // batch_size x n_spheres x 3
|
||||
const torch::Tensor collision_offset, // n_spheres x n_spheres
|
||||
const torch::Tensor weight, const torch::Tensor collision_matrix,
|
||||
const torch::Tensor thread_locations, const int thread_locations_size,
|
||||
const int batch_size, const int nspheres, const bool compute_grad = false,
|
||||
const int ndpt = 8, // Does this need to match template?
|
||||
const bool experimental_kernel = false) {
|
||||
using namespace Curobo::Geometry;
|
||||
|
||||
// use efficient kernel based on number of threads:
|
||||
const int nbpb = 1;
|
||||
|
||||
assert(nspheres < 1024);
|
||||
|
||||
|
||||
  int threadsPerBlock = ((thread_locations_size / 2) + ndpt - 1) /
                        ndpt; // thread_locations_size must be an even number;
                              // we store an (i, j) pair for each sphere pair.
|
||||
//assert(threadsPerBlock/nbpb <=32);
|
||||
if (threadsPerBlock < 32*nbpb)
|
||||
{
|
||||
threadsPerBlock = 32 * nbpb;
|
||||
}
|
||||
if (threadsPerBlock < nspheres)
|
||||
{
|
||||
threadsPerBlock = nspheres;
|
||||
}
|
||||
  int blocksPerGrid = (batch_size + nbpb - 1) / nbpb; // each block handles nbpb batches
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
// ((threadsPerBlock >= nspheres))&&
|
||||
if (experimental_kernel) {
|
||||
|
||||
int smemSize = nbpb * nspheres * sizeof(float4);
|
||||
|
||||
|
||||
|
||||
if (ndpt == 1)
|
||||
{
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 1, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
|
||||
else if (ndpt == 2)
|
||||
{
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 2, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
|
||||
else if (ndpt == 4)
|
||||
{
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 4, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
else if (ndpt == 8)
|
||||
{
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 8, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
else if (ndpt == 32)
|
||||
{
|
||||
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 32, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
else if (ndpt == 64)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 64, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
else if (ndpt == 128)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 128, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
|
||||
}
|
||||
else if (ndpt == 512)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel7<scalar_t, 512, nbpb>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
weight.data_ptr<scalar_t>(),
|
||||
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
|
||||
compute_grad);
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
      assert(false); // only ndpt values of 1, 2, 4, 8, 32, 64, 128, or 512 are currently supported.
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
int ndpt_n = 32; // number of distances to be computed per thread
|
||||
int nwpr = (nspheres + 31) / 32;
|
||||
int warpsPerBlock = nwpr * ((nspheres + ndpt_n - 1) / ndpt_n);
|
||||
threadsPerBlock = warpsPerBlock * 32;
|
||||
blocksPerGrid = batch_size;
|
||||
// printf("Blocks: %d, threads/block:%d, ndpt=%d, nwpr=%d\n", blocksPerGrid,
|
||||
// threadsPerBlock, ndpt, nwpr);
|
||||
|
||||
int smemSize = nspheres * sizeof(float4);
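    // Worked example (illustrative): with nspheres = 70, nwpr = 3 warps cover
    // one row of columns, there are ceil(70 / 32) = 3 row groups, so
    // warpsPerBlock = 9, threadsPerBlock = 288, and one block per batch entry.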
|
||||
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
|
||||
self_collision_distance_kernel4<scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
|
||||
out_distance.data_ptr<scalar_t>(),
|
||||
out_vec.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
collision_offset.data_ptr<scalar_t>(),
|
||||
collision_matrix.data_ptr<uint8_t>(), batch_size, nspheres,
|
||||
ndpt_n, nwpr, weight.data_ptr<scalar_t>(),
|
||||
sparse_index.data_ptr<uint8_t>(), compute_grad);
|
||||
}));
|
||||
}
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {out_distance, out_vec, sparse_index};
|
||||
}
|
||||
|
||||
1567
src/curobo/curobolib/cpp/sphere_obb_kernel.cu
Normal file
1567
src/curobo/curobolib/cpp/sphere_obb_kernel.cu
Normal file
File diff suppressed because it is too large
257
src/curobo/curobolib/cpp/tensor_step_cuda.cpp
Normal file
257
src/curobo/curobolib/cpp/tensor_step_cuda.cpp
Normal file
@@ -0,0 +1,257 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <vector>
|
||||
|
||||
std::vector<torch::Tensor> step_position_clique(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size, const int horizon,
|
||||
const int dof);
|
||||
std::vector<torch::Tensor> step_position_clique2(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size, const int horizon,
|
||||
const int dof, const int mode);
|
||||
std::vector<torch::Tensor> step_position_clique2_idx(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor start_idx, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof, const int mode);
|
||||
|
||||
std::vector<torch::Tensor> backward_step_position_clique(
|
||||
torch::Tensor out_grad_position, const torch::Tensor grad_position,
|
||||
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
|
||||
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof);
|
||||
std::vector<torch::Tensor> backward_step_position_clique2(
|
||||
torch::Tensor out_grad_position, const torch::Tensor grad_position,
|
||||
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
|
||||
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof, const int mode);
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_acc, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity,
|
||||
const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size,
|
||||
const int horizon, const int dof, const bool use_rk2 = true);
|
||||
|
||||
std::vector<torch::Tensor> step_acceleration_idx(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_acc, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor start_idx, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof,
|
||||
const bool use_rk2 = true);
|
||||
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
|
||||
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) \
|
||||
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_CUDA(x); \
|
||||
CHECK_CONTIGUOUS(x)
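// CHECK_INPUT(x) expands to both checks above, so every wrapper below verifies
// that its tensors are CUDA-resident and contiguous before dispatching.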
|
||||
|
||||
std::vector<torch::Tensor> step_position_clique_wrapper(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size, const int horizon,
|
||||
const int dof) {
|
||||
const at::cuda::OptionalCUDAGuard guard(u_position.device());
|
||||
|
||||
CHECK_INPUT(u_position);
|
||||
CHECK_INPUT(out_position);
|
||||
CHECK_INPUT(out_velocity);
|
||||
CHECK_INPUT(out_acceleration);
|
||||
CHECK_INPUT(out_jerk);
|
||||
CHECK_INPUT(start_position);
|
||||
CHECK_INPUT(start_velocity);
|
||||
CHECK_INPUT(start_acceleration);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return step_position_clique(out_position, out_velocity, out_acceleration,
|
||||
out_jerk, u_position, start_position,
|
||||
start_velocity, start_acceleration, traj_dt,
|
||||
batch_size, horizon, dof);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> step_position_clique2_wrapper(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size, const int horizon,
|
||||
const int dof,
|
||||
const int mode) {
|
||||
const at::cuda::OptionalCUDAGuard guard(u_position.device());
|
||||
|
||||
CHECK_INPUT(u_position);
|
||||
CHECK_INPUT(out_position);
|
||||
CHECK_INPUT(out_velocity);
|
||||
CHECK_INPUT(out_acceleration);
|
||||
CHECK_INPUT(out_jerk);
|
||||
CHECK_INPUT(start_position);
|
||||
CHECK_INPUT(start_velocity);
|
||||
CHECK_INPUT(start_acceleration);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return step_position_clique2(out_position, out_velocity, out_acceleration,
|
||||
out_jerk, u_position, start_position,
|
||||
start_velocity, start_acceleration, traj_dt,
|
||||
batch_size, horizon, dof, mode);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> step_position_clique2_idx_wrapper(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_position, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor start_idx, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof,
|
||||
const int mode) {
|
||||
const at::cuda::OptionalCUDAGuard guard(u_position.device());
|
||||
|
||||
CHECK_INPUT(u_position);
|
||||
CHECK_INPUT(out_position);
|
||||
CHECK_INPUT(out_velocity);
|
||||
CHECK_INPUT(out_acceleration);
|
||||
CHECK_INPUT(out_jerk);
|
||||
CHECK_INPUT(start_position);
|
||||
CHECK_INPUT(start_velocity);
|
||||
CHECK_INPUT(start_acceleration);
|
||||
CHECK_INPUT(traj_dt);
|
||||
CHECK_INPUT(start_idx);
|
||||
|
||||
return step_position_clique2_idx(
|
||||
out_position, out_velocity, out_acceleration, out_jerk, u_position,
|
||||
start_position, start_velocity, start_acceleration, start_idx, traj_dt,
|
||||
batch_size, horizon, dof, mode);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> backward_step_position_clique_wrapper(
|
||||
torch::Tensor out_grad_position, const torch::Tensor grad_position,
|
||||
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
|
||||
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof) {
|
||||
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
|
||||
|
||||
CHECK_INPUT(out_grad_position);
|
||||
CHECK_INPUT(grad_position);
|
||||
CHECK_INPUT(grad_velocity);
|
||||
CHECK_INPUT(grad_acceleration);
|
||||
CHECK_INPUT(grad_jerk);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return backward_step_position_clique(
|
||||
out_grad_position, grad_position, grad_velocity, grad_acceleration,
|
||||
grad_jerk, traj_dt, batch_size, horizon, dof);
|
||||
}
|
||||
std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
|
||||
torch::Tensor out_grad_position, const torch::Tensor grad_position,
|
||||
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
|
||||
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof,
|
||||
const int mode) {
|
||||
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
|
||||
|
||||
CHECK_INPUT(out_grad_position);
|
||||
CHECK_INPUT(grad_position);
|
||||
CHECK_INPUT(grad_velocity);
|
||||
CHECK_INPUT(grad_acceleration);
|
||||
CHECK_INPUT(grad_jerk);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return backward_step_position_clique2(
|
||||
out_grad_position, grad_position, grad_velocity, grad_acceleration,
|
||||
grad_jerk, traj_dt, batch_size, horizon, dof, mode);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> step_acceleration_wrapper(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_acc, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor traj_dt, const int batch_size, const int horizon,
|
||||
const int dof, const bool use_rk2 = true) {
|
||||
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
|
||||
|
||||
CHECK_INPUT(u_acc);
|
||||
CHECK_INPUT(out_position);
|
||||
CHECK_INPUT(out_velocity);
|
||||
CHECK_INPUT(out_acceleration);
|
||||
CHECK_INPUT(out_jerk);
|
||||
CHECK_INPUT(start_position);
|
||||
CHECK_INPUT(start_velocity);
|
||||
CHECK_INPUT(start_acceleration);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return step_acceleration(out_position, out_velocity, out_acceleration,
|
||||
out_jerk, u_acc, start_position, start_velocity,
|
||||
start_acceleration, traj_dt, batch_size, horizon,
|
||||
dof, use_rk2);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> step_acceleration_idx_wrapper(
|
||||
torch::Tensor out_position, torch::Tensor out_velocity,
|
||||
torch::Tensor out_acceleration, torch::Tensor out_jerk,
|
||||
const torch::Tensor u_acc, const torch::Tensor start_position,
|
||||
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
|
||||
const torch::Tensor start_idx, const torch::Tensor traj_dt,
|
||||
const int batch_size, const int horizon, const int dof,
|
||||
const bool use_rk2 = true) {
|
||||
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
|
||||
|
||||
CHECK_INPUT(u_acc);
|
||||
CHECK_INPUT(out_position);
|
||||
CHECK_INPUT(out_velocity);
|
||||
CHECK_INPUT(out_acceleration);
|
||||
CHECK_INPUT(out_jerk);
|
||||
CHECK_INPUT(start_position);
|
||||
CHECK_INPUT(start_velocity);
|
||||
CHECK_INPUT(start_acceleration);
|
||||
CHECK_INPUT(start_idx);
|
||||
CHECK_INPUT(traj_dt);
|
||||
|
||||
return step_acceleration_idx(out_position, out_velocity, out_acceleration,
|
||||
out_jerk, u_acc, start_position, start_velocity,
|
||||
start_acceleration, start_idx, traj_dt,
|
||||
batch_size, horizon, dof, use_rk2);
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
m.def("step_position", &step_position_clique_wrapper,
|
||||
"Tensor Step Position (curobolib)");
|
||||
m.def("step_position2", &step_position_clique2_wrapper,
|
||||
"Tensor Step Position (curobolib)");
|
||||
m.def("step_idx_position2", &step_position_clique2_idx_wrapper,
|
||||
"Tensor Step Position (curobolib)");
|
||||
m.def("step_position_backward", &backward_step_position_clique_wrapper,
|
||||
"Tensor Step Position (curobolib)");
|
||||
m.def("step_position_backward2", &backward_step_position_clique2_wrapper,
|
||||
"Tensor Step Position (curobolib)");
|
||||
m.def("step_acceleration", &step_acceleration_wrapper,
|
||||
"Tensor Step Acceleration (curobolib)");
|
||||
m.def("step_acceleration_idx", &step_acceleration_idx_wrapper,
|
||||
"Tensor Step Acceleration (curobolib)");
|
||||
}
|
||||
1692
src/curobo/curobolib/cpp/tensor_step_kernel.cu
Normal file
1692
src/curobo/curobolib/cpp/tensor_step_kernel.cu
Normal file
File diff suppressed because it is too large
126
src/curobo/curobolib/cpp/update_best_kernel.cu
Normal file
126
src/curobo/curobolib/cpp/update_best_kernel.cu
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <cuda.h>
|
||||
#include <torch/extension.h>
|
||||
#include <vector>
|
||||
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
// #include "helper_cuda.h"
|
||||
#include "helper_math.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <cub/cub.cuh>
|
||||
#include <math.h>
|
||||
|
||||
namespace Curobo {
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
// We launch with d_opt*cost_s1 threads.
|
||||
// We assume that cost_s2 is always 1.
|
||||
template <typename scalar_t>
|
||||
__global__ void update_best_kernel(scalar_t *best_cost, // 200x1
|
||||
scalar_t *best_q, // 200x7
|
||||
int16_t *best_iteration, // 200 x 1
|
||||
int16_t *current_iteration, // 1
|
||||
const scalar_t *cost, // 200x1
|
||||
const scalar_t *q, // 200x7
|
||||
const int d_opt, // 7
|
||||
const int cost_s1, // 200
|
||||
const int cost_s2,
|
||||
const int iteration,
|
||||
const float delta_threshold,
|
||||
const float relative_threshold) // 1
|
||||
{
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int size = cost_s1 * d_opt; // size of best_q
|
||||
if (tid >= size) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int cost_idx = tid / d_opt;
|
||||
const float cost_new = cost[cost_idx];
|
||||
const float best_cost_in = best_cost[cost_idx];
|
||||
const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
|
||||
if (change) {
|
||||
best_q[tid] = q[tid]; // update best_q
|
||||
|
||||
if (tid % d_opt == 0) {
|
||||
best_cost[cost_idx] = cost_new ; // update best_cost
|
||||
//best_iteration[cost_idx] = curr_iter + iteration; //
|
||||
// this tensor keeps track of whether the cost reduced by at least
|
||||
// delta_threshold.
|
||||
// here iteration is the last_best parameter.
|
||||
}
|
||||
}
|
||||
|
||||
if (tid % d_opt == 0)
|
||||
{
|
||||
if (change)
|
||||
{
|
||||
best_iteration[cost_idx] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
best_iteration[cost_idx] -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
  // if (tid == 0)
|
||||
//{
|
||||
// curr_iter += 1;
|
||||
// current_iteration[0] = curr_iter;
|
||||
//}
|
||||
|
||||
}
|
||||
} // namespace Optimization
|
||||
} // namespace Curobo
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
|
||||
torch::Tensor best_iteration,
|
||||
torch::Tensor current_iteration,
|
||||
const torch::Tensor cost,
|
||||
const torch::Tensor q, const int d_opt, const int cost_s1,
|
||||
const int cost_s2, const int iteration,
|
||||
const float delta_threshold,
|
||||
const float relative_threshold = 0.999) {
|
||||
using namespace Curobo::Optimization;
|
||||
const int threadsPerBlock = 128;
|
||||
const int cost_size = cost_s1 * d_opt;
|
||||
assert(cost_s2 == 1); // assumption
|
||||
const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
|
||||
// printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
|
||||
// blocksPerGrid);
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
cost.scalar_type(), "update_best_cu", ([&] {
|
||||
update_best_kernel<scalar_t>
|
||||
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||
best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
|
||||
best_iteration.data_ptr<int16_t>(),
|
||||
current_iteration.data_ptr<int16_t>(),
|
||||
cost.data_ptr<scalar_t>(),
|
||||
q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
|
||||
delta_threshold, relative_threshold);
|
||||
}));
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return {best_cost, best_q, best_iteration};
|
||||
}
|
||||
405
src/curobo/curobolib/geom.py
Normal file
405
src/curobo/curobolib/geom.py
Normal file
@@ -0,0 +1,405 @@
|
||||
#
|
||||
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
# property and proprietary rights in and to this material, related
|
||||
# documentation and any modifications thereto. Any use, reproduction,
|
||||
# disclosure or distribution of this material and related documentation
|
||||
# without an express license agreement from NVIDIA CORPORATION or
|
||||
# its affiliates is strictly prohibited.
|
||||
#
|
||||
# Third Party
|
||||
import torch
|
||||
|
||||
# CuRobo
|
||||
from curobo.util.logger import log_warn
|
||||
|
||||
try:
|
||||
# CuRobo
|
||||
from curobo.curobolib import geom_cu
|
||||
|
||||
except ImportError:
|
||||
log_warn("geom_cu binary not found, jit compiling...")
|
||||
# Third Party
|
||||
from torch.utils.cpp_extension import load
|
||||
|
||||
# CuRobo
|
||||
from curobo.curobolib.util_file import add_cpp_path
|
||||
|
||||
geom_cu = load(
|
||||
name="geom_cu",
|
||||
sources=add_cpp_path(
|
||||
[
|
||||
"geom_cuda.cpp",
|
||||
"sphere_obb_kernel.cu",
|
||||
"pose_distance_kernel.cu",
|
||||
"self_collision_kernel.cu",
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def get_self_collision_distance(
|
||||
out_distance,
|
||||
out_vec,
|
||||
sparse_index,
|
||||
robot_spheres,
|
||||
collision_offset,
|
||||
weight,
|
||||
coll_matrix,
|
||||
thread_locations,
|
||||
thread_size,
|
||||
b_size,
|
||||
nspheres,
|
||||
compute_grad,
|
||||
checks_per_thread=32,
|
||||
experimental_kernel=True,
|
||||
):
|
||||
r = geom_cu.self_collision_distance(
|
||||
out_distance,
|
||||
out_vec,
|
||||
sparse_index,
|
||||
robot_spheres,
|
||||
collision_offset,
|
||||
weight,
|
||||
coll_matrix,
|
||||
thread_locations,
|
||||
thread_size,
|
||||
b_size,
|
||||
nspheres,
|
||||
compute_grad,
|
||||
checks_per_thread,
|
||||
experimental_kernel,
|
||||
)
|
||||
|
||||
out_distance = r[0]
|
||||
out_vec = r[1]
|
||||
return out_distance, out_vec
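# Shape note (illustrative, inferred from SelfCollisionDistance.forward below):
# robot_spheres is (batch, horizon, n_spheres, 4), the kernel is launched with
# b_size = batch * horizon, out_distance is (b_size,), out_vec is
# (b_size, n_spheres, 4), and sparse_index marks spheres that received gradients.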
|
||||
|
||||
|
||||
class SelfCollisionDistance(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
out_distance,
|
||||
out_vec,
|
||||
sparse_idx,
|
||||
robot_spheres,
|
||||
sphere_offset,
|
||||
weight,
|
||||
coll_matrix,
|
||||
thread_locations,
|
||||
max_thread,
|
||||
checks_per_thread: int,
|
||||
experimental_kernel: bool,
|
||||
return_loss: bool = False,
|
||||
):
|
||||
# get batch size
|
||||
b, h, n_spheres, _ = robot_spheres.shape
|
||||
out_distance, out_vec = get_self_collision_distance(
|
||||
out_distance,
|
||||
out_vec,
|
||||
sparse_idx,
|
||||
robot_spheres, # .view(-1, 4),
|
||||
sphere_offset,
|
||||
weight,
|
||||
coll_matrix,
|
||||
thread_locations,
|
||||
max_thread,
|
||||
b * h,
|
||||
n_spheres,
|
||||
robot_spheres.requires_grad,
|
||||
checks_per_thread,
|
||||
experimental_kernel,
|
||||
)
|
||||
ctx.return_loss = return_loss
|
||||
ctx.save_for_backward(out_vec)
|
||||
return out_distance
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out_distance):
|
||||
sphere_grad = None
|
||||
if ctx.needs_input_grad[3]:
|
||||
(g_vec,) = ctx.saved_tensors
|
||||
if ctx.return_loss:
|
||||
g_vec = g_vec * grad_out_distance.unsqueeze(1)
|
||||
sphere_grad = g_vec
|
||||
return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
|
||||
|
||||
|
||||
class SelfCollisionDistanceLoss(SelfCollisionDistance):
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out_distance):
|
||||
sphere_grad = None
|
||||
if ctx.needs_input_grad[3]:
|
||||
(g_vec,) = ctx.saved_tensors
|
||||
sphere_grad = g_vec * grad_out_distance.unsqueeze(1)
|
||||
        return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
|
||||
|
||||
|
||||
def get_pose_distance(
|
||||
out_distance,
|
||||
out_position_distance,
|
||||
out_rotation_distance,
|
||||
out_p_vec,
|
||||
out_q_vec,
|
||||
out_idx,
|
||||
current_position,
|
||||
goal_position,
|
||||
current_quat,
|
||||
goal_quat,
|
||||
vec_weight,
|
||||
weight,
|
||||
vec_convergence,
|
||||
run_weight,
|
||||
run_vec_weight,
|
||||
batch_pose_idx,
|
||||
batch_size,
|
||||
horizon,
|
||||
mode=1,
|
||||
num_goals=1,
|
||||
write_grad=False,
|
||||
write_distance=False,
|
||||
use_metric=False,
|
||||
):
|
||||
if batch_pose_idx.shape[0] != batch_size:
|
||||
raise ValueError("Index buffer size is different from batch size")
|
||||
|
||||
r = geom_cu.pose_distance(
|
||||
out_distance,
|
||||
out_position_distance,
|
||||
out_rotation_distance,
|
||||
out_p_vec,
|
||||
out_q_vec,
|
||||
out_idx,
|
||||
current_position,
|
||||
goal_position.view(-1),
|
||||
current_quat,
|
||||
goal_quat.view(-1),
|
||||
vec_weight,
|
||||
weight,
|
||||
vec_convergence,
|
||||
run_weight,
|
||||
run_vec_weight,
|
||||
batch_pose_idx,
|
||||
batch_size,
|
||||
horizon,
|
||||
mode,
|
||||
num_goals,
|
||||
write_grad,
|
||||
write_distance,
|
||||
use_metric,
|
||||
)
|
||||
|
||||
out_distance = r[0]
|
||||
out_position_distance = r[1]
|
||||
out_rotation_distance = r[2]
|
||||
|
||||
out_p_vec = r[3]
|
||||
out_q_vec = r[4]
|
||||
|
||||
out_idx = r[5]
|
||||
return out_distance, out_position_distance, out_rotation_distance, out_p_vec, out_q_vec, out_idx
|
||||
|
||||
|
||||
def get_pose_distance_backward(
|
||||
out_grad_p,
|
||||
out_grad_q,
|
||||
grad_distance,
|
||||
grad_p_distance,
|
||||
grad_q_distance,
|
||||
pose_weight,
|
||||
grad_p_vec,
|
||||
grad_q_vec,
|
||||
batch_size,
|
||||
use_distance=False,
|
||||
):
|
||||
r = geom_cu.pose_distance_backward(
|
||||
out_grad_p,
|
||||
out_grad_q,
|
||||
grad_distance,
|
||||
grad_p_distance,
|
||||
grad_q_distance,
|
||||
pose_weight,
|
||||
grad_p_vec,
|
||||
grad_q_vec,
|
||||
batch_size,
|
||||
use_distance,
|
||||
)
|
||||
return r[0], r[1]
|
||||
|
||||
|
||||
class SdfSphereOBB(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
query_sphere,
|
||||
out_buffer,
|
||||
grad_out_buffer,
|
||||
sparsity_idx,
|
||||
weight,
|
||||
activation_distance,
|
||||
box_accel,
|
||||
box_dims,
|
||||
box_pose,
|
||||
box_enable,
|
||||
n_env_obb,
|
||||
env_query_idx,
|
||||
max_nobs,
|
||||
batch_size,
|
||||
horizon,
|
||||
n_spheres,
|
||||
transform_back,
|
||||
compute_distance,
|
||||
use_batch_env,
|
||||
return_loss: bool = False,
|
||||
):
|
||||
r = geom_cu.closest_point(
|
||||
query_sphere,
|
||||
out_buffer,
|
||||
grad_out_buffer,
|
||||
sparsity_idx,
|
||||
weight,
|
||||
activation_distance,
|
||||
box_accel,
|
||||
box_dims,
|
||||
box_pose,
|
||||
box_enable,
|
||||
n_env_obb,
|
||||
env_query_idx,
|
||||
max_nobs,
|
||||
batch_size,
|
||||
horizon,
|
||||
n_spheres,
|
||||
transform_back,
|
||||
compute_distance,
|
||||
use_batch_env,
|
||||
)
|
||||
# r[1][r[1]!=r[1]] = 0.0
|
||||
ctx.return_loss = return_loss
|
||||
ctx.save_for_backward(r[1])
|
||||
return r[0]
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
grad_pt = None
|
||||
if ctx.needs_input_grad[0]:
|
||||
(r,) = ctx.saved_tensors
|
||||
if ctx.return_loss:
|
||||
r = r * grad_output.unsqueeze(-1)
|
||||
grad_pt = r
|
||||
return (
|
||||
grad_pt,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
|
||||
class SdfSweptSphereOBB(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
query_sphere,
|
||||
out_buffer,
|
||||
grad_out_buffer,
|
||||
sparsity_idx,
|
||||
weight,
|
||||
activation_distance,
|
||||
speed_dt,
|
||||
box_accel,
|
||||
box_dims,
|
||||
box_pose,
|
||||
box_enable,
|
||||
n_env_obb,
|
||||
env_query_idx,
|
||||
max_nobs,
|
||||
batch_size,
|
||||
horizon,
|
||||
n_spheres,
|
||||
sweep_steps,
|
||||
enable_speed_metric,
|
||||
transform_back,
|
||||
compute_distance,
|
||||
use_batch_env,
|
||||
return_loss: bool = False,
|
||||
):
|
||||
r = geom_cu.swept_closest_point(
|
||||
query_sphere,
|
||||
out_buffer,
|
||||
grad_out_buffer,
|
||||
sparsity_idx,
|
||||
weight,
|
||||
activation_distance,
|
||||
speed_dt,
|
||||
box_accel,
|
||||
box_dims,
|
||||
box_pose,
|
||||
box_enable,
|
||||
n_env_obb,
|
||||
env_query_idx,
|
||||
max_nobs,
|
||||
batch_size,
|
||||
horizon,
|
||||
n_spheres,
|
||||
sweep_steps,
|
||||
enable_speed_metric,
|
||||
transform_back,
|
||||
compute_distance,
|
||||
use_batch_env,
|
||||
)
|
||||
ctx.return_loss = return_loss
|
||||
ctx.save_for_backward(
|
||||
r[1],
|
||||
)
|
||||
return r[0]
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
grad_pt = None
|
||||
if ctx.needs_input_grad[0]:
|
||||
(r,) = ctx.saved_tensors
|
||||
if ctx.return_loss:
|
||||
r = r * grad_output.unsqueeze(-1)
|
||||
grad_pt = r
|
||||
return (
|
||||
grad_pt,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
498
src/curobo/curobolib/kinematics.py
Normal file
498
src/curobo/curobolib/kinematics.py
Normal file
@@ -0,0 +1,498 @@
|
||||
#
|
||||
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
# property and proprietary rights in and to this material, related
|
||||
# documentation and any modifications thereto. Any use, reproduction,
|
||||
# disclosure or distribution of this material and related documentation
|
||||
# without an express license agreement from NVIDIA CORPORATION or
|
||||
# its affiliates is strictly prohibited.
|
||||
#
|
||||
# Third Party
|
||||
import torch
|
||||
from torch.autograd import Function
|
||||
|
||||
# CuRobo
|
||||
from curobo.util.logger import log_warn
|
||||
|
||||
try:
|
||||
# CuRobo
|
||||
from curobo.curobolib import kinematics_fused_cu
|
||||
except ImportError:
|
||||
log_warn("kinematics_fused_cu not found, JIT compiling...")
|
||||
# Third Party
|
||||
from torch.utils.cpp_extension import load
|
||||
|
||||
# CuRobo
|
||||
from curobo.curobolib.util_file import add_cpp_path
|
||||
|
||||
kinematics_fused_cu = load(
|
||||
name="kinematics_fused_cu",
|
||||
sources=add_cpp_path(
|
||||
[
|
||||
"kinematics_fused_cuda.cpp",
|
||||
"kinematics_fused_kernel.cu",
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def rotation_matrix_to_quaternion(in_mat, out_quat):
|
||||
r = kinematics_fused_cu.matrix_to_quaternion(out_quat, in_mat.reshape(-1, 9))
|
||||
return r[0]
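# Usage sketch (illustrative; out_quat is assumed to be a preallocated (N, 4)
# tensor on the same device as the N rotation matrices in in_mat):
#   quat = rotation_matrix_to_quaternion(rot_mats, out_quat)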
|
||||
|
||||
|
||||
class KinematicsFusedFunction(Function):
|
||||
@staticmethod
|
||||
def _call_cuda(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
r = kinematics_fused_cu.forward(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
False,
|
||||
)
|
||||
out_link_pos = r[0]
|
||||
out_link_quat = r[1]
|
||||
out_spheres = r[2]
|
||||
global_cumul_mat = r[3]
|
||||
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
|
||||
|
||||
@staticmethod
|
||||
def _call_backward_cuda(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
sparsity_opt=True,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
if grad_out.is_contiguous():
|
||||
grad_out = grad_out.view(-1)
|
||||
else:
|
||||
grad_out = grad_out.reshape(-1)
|
||||
r = kinematics_fused_cu.backward(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
sparsity_opt,
|
||||
False,
|
||||
)
|
||||
out_q = r[0].view(b_size, -1)
|
||||
|
||||
return out_q
|
||||
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
# link_mats: torch.Tensor,
|
||||
link_pos: torch.Tensor,
|
||||
link_quat: torch.Tensor,
|
||||
b_robot_spheres: torch.tensor,
|
||||
global_cumul_mat: torch.Tensor,
|
||||
joint_seq: torch.Tensor,
|
||||
fixed_transform: torch.tensor,
|
||||
robot_spheres: torch.tensor,
|
||||
link_map: torch.tensor,
|
||||
joint_map: torch.Tensor,
|
||||
joint_map_type: torch.Tensor,
|
||||
store_link_map: torch.Tensor,
|
||||
link_sphere_map: torch.Tensor,
|
||||
link_chain_map: torch.Tensor,
|
||||
grad_out: torch.Tensor,
|
||||
):
|
||||
link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
|
||||
link_pos,
|
||||
link_quat,
|
||||
b_robot_spheres,
|
||||
global_cumul_mat,
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
)
|
||||
ctx.save_for_backward(
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
)
|
||||
|
||||
return link_pos, link_quat, b_robot_spheres
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
|
||||
grad_joint = None
|
||||
|
||||
if ctx.needs_input_grad[4]:
|
||||
(
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
) = ctx.saved_tensors
|
||||
grad_joint = KinematicsFusedFunction._call_backward_cuda(
|
||||
grad_out,
|
||||
grad_out_link_pos,
|
||||
grad_out_link_quat,
|
||||
grad_out_spheres,
|
||||
global_cumul_mat,
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
True,
|
||||
)
|
||||
|
||||
return (
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
grad_joint,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
|
||||
class KinematicsFusedGlobalCumulFunction(Function):
|
||||
@staticmethod
|
||||
def _call_cuda(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
# print(n_spheres)
|
||||
# print(angle)
|
||||
r = kinematics_fused_cu.forward(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
True,
|
||||
)
|
||||
out_link_pos = r[0]
|
||||
out_link_quat = r[1]
|
||||
out_spheres = r[2]
|
||||
global_cumul_mat = r[3]
|
||||
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
|
||||
|
||||
@staticmethod
|
||||
def _call_backward_cuda(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
sparsity_opt=True,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
# b_size = link_mat_out.shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
if grad_out.is_contiguous():
|
||||
grad_out = grad_out.view(-1)
|
||||
else:
|
||||
grad_out = grad_out.reshape(-1)
|
||||
# create backward tensors?
|
||||
|
||||
r = kinematics_fused_cu.backward(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
sparsity_opt,
|
||||
True,
|
||||
)
|
||||
        out_q = r[0]  # .view(angle.shape)
        out_q = out_q.view(b_size, -1)
        return out_q

    @staticmethod
    def forward(
        ctx,
        # link_mats: torch.Tensor,
        link_pos: torch.Tensor,
        link_quat: torch.Tensor,
        b_robot_spheres: torch.Tensor,
        global_cumul_mat: torch.Tensor,
        joint_seq: torch.Tensor,
        fixed_transform: torch.Tensor,
        robot_spheres: torch.Tensor,
        link_map: torch.Tensor,
        joint_map: torch.Tensor,
        joint_map_type: torch.Tensor,
        store_link_map: torch.Tensor,
        link_sphere_map: torch.Tensor,
        link_chain_map: torch.Tensor,
        grad_out: torch.Tensor,
    ):
        if joint_seq.is_contiguous():
            joint_seq = joint_seq.view(-1)
        else:
            joint_seq = joint_seq.reshape(-1)
        (
            link_pos,
            link_quat,
            b_robot_spheres,
            global_cumul_mat,
        ) = KinematicsFusedGlobalCumulFunction._call_cuda(
            link_pos,
            link_quat,
            b_robot_spheres,
            global_cumul_mat,
            joint_seq,
            fixed_transform,
            robot_spheres,
            link_map,
            joint_map,
            joint_map_type,
            store_link_map,
            link_sphere_map,
        )
        ctx.save_for_backward(
            joint_seq,
            fixed_transform,
            robot_spheres,
            link_map,
            joint_map,
            joint_map_type,
            store_link_map,
            link_sphere_map,
            link_chain_map,
            grad_out,
            global_cumul_mat,
        )

        return link_pos, link_quat, b_robot_spheres

    @staticmethod
    def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
        grad_joint = None

        if ctx.needs_input_grad[4]:
            (
                joint_seq,
                fixed_transform,
                robot_spheres,
                link_map,
                joint_map,
                joint_map_type,
                store_link_map,
                link_sphere_map,
                link_chain_map,
                grad_out,
                global_cumul_mat,
            ) = ctx.saved_tensors
            grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
                grad_out,
                grad_out_link_pos,
                grad_out_link_quat,
                grad_out_spheres,
                global_cumul_mat,
                joint_seq,
                fixed_transform,
                robot_spheres,
                link_map,
                joint_map,
                joint_map_type,
                store_link_map,
                link_sphere_map,
                link_chain_map,
                True,
            )

        return (
            None,
            None,
            None,
            None,
            grad_joint,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )


def get_cuda_kinematics(
    link_pos_seq,
    link_quat_seq,
    batch_robot_spheres,
    global_cumul_mat,
    q_in,
    fixed_transform,
    link_spheres_tensor,
    link_map,  # tells which link is attached to which link i
    joint_map,  # tells which joint is attached to a link i
    joint_map_type,  # joint type
    store_link_map,
    link_sphere_idx_map,  # sphere idx map
    link_chain_map,
    grad_out_q,
    use_global_cumul: bool = True,
):
    if use_global_cumul:
        link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
            link_pos_seq,
            link_quat_seq,
            batch_robot_spheres,
            global_cumul_mat,
            q_in,
            fixed_transform,
            link_spheres_tensor,
            link_map,  # tells which link is attached to which link i
            joint_map,  # tells which joint is attached to a link i
            joint_map_type,  # joint type
            store_link_map,
            link_sphere_idx_map,  # sphere idx map
            link_chain_map,
            grad_out_q,
        )
    else:
        link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
            link_pos_seq,
            link_quat_seq,
            batch_robot_spheres,
            global_cumul_mat,
            q_in,
            fixed_transform,
            link_spheres_tensor,
            link_map,  # tells which link is attached to which link i
            joint_map,  # tells which joint is attached to a link i
            joint_map_type,  # joint type
            store_link_map,
            link_sphere_idx_map,  # sphere idx map
            link_chain_map,
            grad_out_q,
        )
    return link_pos, link_quat, robot_spheres
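The autograd wiring above is easier to follow in miniature: KinematicsFusedGlobalCumulFunction is a torch.autograd.Function that saves its constant buffers, returns a gradient only for the joint configuration (forward input index 4), and returns None for every other input. A minimal, self-contained sketch of the same pattern (the Scale function below is a hypothetical stand-in, not part of curobolib):

import torch


class Scale(torch.autograd.Function):
    # Same structure as the fused kinematics Function: one differentiable input,
    # every auxiliary buffer gets a None gradient.
    @staticmethod
    def forward(ctx, x, factor_buffer):
        ctx.save_for_backward(factor_buffer)
        return x * factor_buffer

    @staticmethod
    def backward(ctx, grad_out):
        (factor_buffer,) = ctx.saved_tensors
        grad_x = None
        if ctx.needs_input_grad[0]:
            grad_x = grad_out * factor_buffer
        # One return value per forward() input; non-differentiable inputs get None.
        return grad_x, None


x = torch.randn(4, requires_grad=True)
Scale.apply(x, torch.full((4,), 2.0)).sum().backward()
assert torch.allclose(x.grad, torch.full((4,), 2.0))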
112
src/curobo/curobolib/ls.py
Normal file
@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch

# CuRobo
from curobo.util.logger import log_warn

try:
    # CuRobo
    from curobo.curobolib import line_search_cu
except ImportError:
    log_warn("line_search_cu not found, JIT compiling...")

    # Third Party
    from torch.utils.cpp_extension import load

    # CuRobo
    from curobo.curobolib.util_file import add_cpp_path

    line_search_cu = load(
        name="line_search_cu",
        sources=add_cpp_path(
            [
                "line_search_cuda.cpp",
                "line_search_kernel.cu",
                "update_best_kernel.cu",
            ]
        ),
    )


def wolfe_line_search(
    # m_idx,
    best_x,
    best_c,
    best_grad,
    g_x,
    x_set,
    sv,
    c,
    c_idx,
    c_1: float,
    c_2: float,
    al,
    sw: bool,
    aw: bool,
):
    batchsize = g_x.shape[0]
    l1 = g_x.shape[1]
    l2 = g_x.shape[2]
    r = line_search_cu.line_search(
        # m_idx,
        best_x,
        best_c,
        best_grad,
        g_x,
        x_set,
        sv,
        c,
        al,
        c_idx,
        c_1,
        c_2,
        sw,
        aw,
        l1,
        l2,
        batchsize,
    )
    return (r[0], r[1], r[2])


def update_best(
    best_cost,
    best_q,
    best_iteration,
    current_iteration,
    cost,
    q,
    d_opt: int,
    iteration: int,
    delta_threshold: float = 1e-5,
    relative_threshold: float = 0.999,
):
    cost_s1 = cost.shape[0]
    cost_s2 = cost.shape[1]
    r = line_search_cu.update_best(
        best_cost,
        best_q,
        best_iteration,
        current_iteration,
        cost,
        q,
        d_opt,
        cost_s1,
        cost_s2,
        iteration,
        delta_threshold,
        relative_threshold,
    )
    return (r[0], r[1], r[2])  # output: best_cost, best_q, best_iteration
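wolfe_line_search wraps a batched line search that evaluates candidate step sizes against Wolfe-style acceptance conditions on the GPU (c_1 and c_2 are the usual sufficient-decrease and curvature constants; sw and aw presumably select the strong and approximate variants). For reference only, a plain-PyTorch check of the two conditions for a single candidate step; the names and shapes below are illustrative and do not mirror the kernel's argument layout:

import torch


def wolfe_conditions(f_x, g_x, f_new, g_new, step_direction, alpha, c_1=1e-4, c_2=0.9):
    # f_x, f_new: scalar costs at x and at x + alpha * step_direction.
    # g_x, g_new: gradients (1-D tensors) at those two points.
    slope = torch.dot(g_x, step_direction)
    sufficient_decrease = f_new <= f_x + c_1 * alpha * slope  # Armijo condition
    curvature = torch.abs(torch.dot(g_new, step_direction)) <= c_2 * torch.abs(slope)
    return bool(sufficient_decrease), bool(curvature)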
112
src/curobo/curobolib/opt.py
Normal file
@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
from torch.autograd import Function

# CuRobo
from curobo.util.logger import log_warn

try:
    # CuRobo
    from curobo.curobolib import lbfgs_step_cu
except ImportError:
    log_warn("lbfgs_step_cu not found, JIT compiling...")
    # Third Party
    from torch.utils.cpp_extension import load

    # CuRobo
    from curobo.curobolib.util_file import add_cpp_path

    lbfgs_step_cu = load(
        name="lbfgs_step_cu",
        sources=add_cpp_path(
            [
                "lbfgs_step_cuda.cpp",
                "lbfgs_step_kernel.cu",
            ]
        ),
    )


class LBFGScu(Function):
    @staticmethod
    def _call_cuda(
        step_vec,
        rho_buffer,
        y_buffer,
        s_buffer,
        q,
        grad_q,
        x_0,
        grad_0,
        epsilon=0.1,
        stable_mode=False,
    ):
        m, b, v_dim, _ = y_buffer.shape

        R = lbfgs_step_cu.forward(
            step_vec,  # .view(-1),
            rho_buffer,  # .view(-1),
            y_buffer,  # .view(-1),
            s_buffer,  # .view(-1),
            q,
            grad_q,  # .view(-1),
            x_0,
            grad_0,
            epsilon,
            b,
            m,
            v_dim,
            stable_mode,
        )
        step_v = R[0].view(step_vec.shape)

        return step_v

    @staticmethod
    def forward(
        ctx,
        step_vec,
        rho_buffer,
        y_buffer,
        s_buffer,
        q,
        grad_q,
        x_0,
        grad_0,
        epsilon=0.1,
        stable_mode=False,
    ):
        R = LBFGScu._call_cuda(
            step_vec,
            rho_buffer,
            y_buffer,
            s_buffer,
            q,
            grad_q,
            x_0,
            grad_0,
            epsilon=epsilon,
            stable_mode=stable_mode,
        )
        # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
        return R

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient (None) per input of forward(); none of the inputs are differentiable here.
        return (
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
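lbfgs_step_cu.forward computes a batched L-BFGS update step from the (m, b, v_dim, 1) history buffers passed in by _call_cuda. As a reference for what that step amounts to, a non-fused, single-problem sketch of the standard two-loop recursion follows; the buffer layout and the epsilon fallback are assumptions for illustration, not the kernel's exact behaviour:

import torch


def lbfgs_two_loop(grad_q, s_buffer, y_buffer, rho_buffer, epsilon=0.1):
    # Assumed per-problem layout: grad_q (v_dim, 1); s_buffer, y_buffer (m, v_dim, 1)
    # with index m-1 the most recent pair; rho_buffer (m, 1, 1) with rho_i = 1/(y_i^T s_i).
    m = s_buffer.shape[0]
    q = grad_q.clone()
    alpha = torch.zeros(m)
    for i in reversed(range(m)):  # newest to oldest
        alpha[i] = rho_buffer[i, 0, 0] * torch.sum(s_buffer[i] * q)
        q = q - alpha[i] * y_buffer[i]
    # Initial Hessian scaling gamma = s^T y / y^T y, with a small fallback (assumed).
    sy = torch.sum(s_buffer[-1] * y_buffer[-1])
    yy = torch.sum(y_buffer[-1] * y_buffer[-1]).clamp_min(1e-12)
    gamma = sy / yy if sy > 0 else torch.tensor(epsilon)
    r = gamma * q
    for i in range(m):  # oldest to newest
        beta = rho_buffer[i, 0, 0] * torch.sum(y_buffer[i] * r)
        r = r + (alpha[i] - beta) * s_buffer[i]
    return -r  # quasi-Newton step direction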
195
src/curobo/curobolib/tensor_step.py
Normal file
@@ -0,0 +1,195 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch

# CuRobo
from curobo.util.logger import log_warn

try:
    # CuRobo
    from curobo.curobolib import tensor_step_cu

except ImportError:
    # Third Party
    from torch.utils.cpp_extension import load

    # CuRobo
    from curobo.curobolib.util_file import add_cpp_path

    log_warn("tensor_step_cu not found, JIT compiling...")
    tensor_step_cu = load(
        name="tensor_step_cu",
        sources=add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"]),
    )


def tensor_step_pos_clique_idx_fwd(
    out_position,
    out_velocity,
    out_acceleration,
    out_jerk,
    u_position,
    start_position,
    start_velocity,
    start_acceleration,
    start_idx,
    traj_dt,
    batch_size,
    horizon,
    dof,
    mode=-1,
):
    r = tensor_step_cu.step_idx_position2(
        out_position,
        out_velocity,
        out_acceleration,
        out_jerk,
        u_position,
        start_position,
        start_velocity,
        start_acceleration,
        start_idx,
        traj_dt,
        batch_size,
        horizon,
        dof,
        mode,
    )
    return (r[0], r[1], r[2], r[3])


def tensor_step_pos_clique_fwd(
    out_position,
    out_velocity,
    out_acceleration,
    out_jerk,
    u_position,
    start_position,
    start_velocity,
    start_acceleration,
    traj_dt,
    batch_size,
    horizon,
    dof,
    mode=-1,
):
    r = tensor_step_cu.step_position2(
        out_position,
        out_velocity,
        out_acceleration,
        out_jerk,
        u_position,
        start_position,
        start_velocity,
        start_acceleration,
        traj_dt,
        batch_size,
        horizon,
        dof,
        mode,
    )
    return (r[0], r[1], r[2], r[3])


def tensor_step_acc_fwd(
    out_position,
    out_velocity,
    out_acceleration,
    out_jerk,
    u_acc,
    start_position,
    start_velocity,
    start_acceleration,
    traj_dt,
    batch_size,
    horizon,
    dof,
    use_rk2=True,
):
    r = tensor_step_cu.step_acceleration(
        out_position,
        out_velocity,
        out_acceleration,
        out_jerk,
        u_acc,
        start_position,
        start_velocity,
        start_acceleration,
        traj_dt,
        batch_size,
        horizon,
        dof,
        use_rk2,
    )
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk


def tensor_step_acc_idx_fwd(
    out_position,
    out_velocity,
    out_acceleration,
    out_jerk,
    u_acc,
    start_position,
    start_velocity,
    start_acceleration,
    start_idx,
    traj_dt,
    batch_size,
    horizon,
    dof,
    use_rk2=True,
):
    r = tensor_step_cu.step_acceleration_idx(
        out_position,
        out_velocity,
        out_acceleration,
        out_jerk,
        u_acc,
        start_position,
        start_velocity,
        start_acceleration,
        start_idx,
        traj_dt,
        batch_size,
        horizon,
        dof,
        use_rk2,
    )
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk


def tensor_step_pos_clique_bwd(
    out_grad_position,
    grad_position,
    grad_velocity,
    grad_acceleration,
    grad_jerk,
    traj_dt,
    batch_size,
    horizon,
    dof,
    mode=-1,
):
    r = tensor_step_cu.step_position_backward2(
        out_grad_position,
        grad_position,
        grad_velocity,
        grad_acceleration,
        grad_jerk,
        traj_dt,
        batch_size,
        horizon,
        dof,
        mode,
    )
    return r[0]
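These wrappers hand pre-allocated output buffers to kernels that integrate a position (or acceleration) plan into position, velocity, acceleration, and jerk trajectories. As an illustration of the underlying bookkeeping only, a plain-PyTorch backward-difference rollout with assumed (batch, horizon, dof) shapes; the fused kernels additionally handle start-state stitching and use their own stencil defined in tensor_step_kernel.cu:

import torch


def finite_difference_rollout(u_position, start_position, traj_dt):
    # u_position: (batch, horizon, dof) position plan; start_position: (batch, dof);
    # traj_dt: scalar time step. Returns position, velocity, acceleration, jerk.
    prev = torch.cat([start_position.unsqueeze(1), u_position[:, :-1]], dim=1)
    velocity = (u_position - prev) / traj_dt
    acceleration = torch.zeros_like(velocity)
    acceleration[:, 1:] = (velocity[:, 1:] - velocity[:, :-1]) / traj_dt
    jerk = torch.zeros_like(acceleration)
    jerk[:, 1:] = (acceleration[:, 1:] - acceleration[:, :-1]) / traj_dt
    return u_position, velocity, acceleration, jerk


# Example shapes matching the wrappers above (batch_size=2, horizon=5, dof=7):
pos, vel, acc, jerk = finite_difference_rollout(
    torch.randn(2, 5, 7), torch.zeros(2, 7), traj_dt=0.02
)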
34
src/curobo/curobolib/util_file.py
Normal file
@@ -0,0 +1,34 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#

# Standard Library
import os


def get_cpp_path():
    path = os.path.dirname(__file__)
    return os.path.join(path, "cpp")


def join_path(path1, path2):
    if isinstance(path2, str):
        return os.path.join(path1, path2)
    else:
        return path2


def add_cpp_path(sources):
    cpp_path = get_cpp_path()
    new_list = []
    for s in sources:
        s = join_path(cpp_path, s)
        new_list.append(s)
    return new_list
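add_cpp_path is what the JIT-compilation fallbacks above use to resolve kernel sources relative to the packaged cpp/ directory. A quick usage example, assuming the curobo package is importable:

from curobo.curobolib.util_file import add_cpp_path

sources = add_cpp_path(["lbfgs_step_cuda.cpp", "lbfgs_step_kernel.cu"])
# -> absolute paths under .../curobo/curobolib/cpp/, ready to pass to torch.utils.cpp_extension.load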