release repository

Balakumar Sundaralingam
2023-10-26 04:17:19 -07:00
commit 07e6ccfc91
287 changed files with 70659 additions and 0 deletions


@@ -0,0 +1,11 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
"""CuRoboLib Module."""


@@ -0,0 +1,260 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <map>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
// CUDA forward declarations
std::vector<torch::Tensor> self_collision_distance(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
const torch::Tensor collision_offset, // n_spheres x n_spheres
const torch::Tensor weight,
const torch::Tensor collision_matrix, // n_spheres x n_spheres
const torch::Tensor thread_locations, const int locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, // Does this need to match template?
const bool debug = false);
// CUDA forward declarations
std::vector<torch::Tensor> swept_sphere_obb_clpt(
const torch::Tensor sphere_position, // batch_size, 3
torch::Tensor distance, // batch_size, 1
torch::Tensor
closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance, const torch::Tensor speed_dt,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4,
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
const bool transform_back, const bool compute_distance,
const bool use_batch_env);
std::vector<torch::Tensor>
sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const bool transform_back,
const bool compute_distance, const bool use_batch_env);
std::vector<torch::Tensor> pose_distance(
torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, // n_boxes, 4, 4
const torch::Tensor vec_convergence, const torch::Tensor run_weight,
const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
const int batch_size, const int horizon, const int mode,
const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = true, const bool use_metric = false);
std::vector<torch::Tensor>
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance,
const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> self_collision_distance_wrapper(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
const torch::Tensor collision_offset, // n_spheres
const torch::Tensor weight,
const torch::Tensor collision_matrix, // n_spheres
const torch::Tensor thread_locations, const int thread_locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, const bool debug = false) {
CHECK_INPUT(out_distance);
CHECK_INPUT(out_vec);
CHECK_INPUT(robot_spheres);
CHECK_INPUT(collision_offset);
CHECK_INPUT(sparse_index);
CHECK_INPUT(weight);
CHECK_INPUT(thread_locations);
CHECK_INPUT(collision_matrix);
const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());
return self_collision_distance(
out_distance, out_vec, sparse_index, robot_spheres,
collision_offset, weight, collision_matrix, thread_locations,
thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
}
std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const bool transform_back, const bool compute_distance,
const bool use_batch_env) {
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
CHECK_INPUT(sparsity_idx);
CHECK_INPUT(weight);
CHECK_INPUT(activation_distance);
CHECK_INPUT(obb_accel);
return sphere_obb_clpt(
sphere_position, distance, closest_point, sparsity_idx, weight,
activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> swept_sphere_obb_clpt_wrapper(
const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance, const torch::Tensor speed_dt,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
const bool transform_back, const bool compute_distance,
const bool use_batch_env) {
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
return swept_sphere_obb_clpt(
sphere_position,
distance, closest_point, sparsity_idx, weight, activation_distance,
speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
enable_speed_metric, transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> pose_distance_wrapper(
torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, const torch::Tensor vec_convergence,
const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
const int mode, const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = false, const bool use_metric = false) {
// at::cuda::DeviceGuard guard(angle.device());
CHECK_INPUT(out_distance);
CHECK_INPUT(out_position_distance);
CHECK_INPUT(out_rotation_distance);
CHECK_INPUT(distance_p_vector);
CHECK_INPUT(distance_q_vector);
CHECK_INPUT(current_position);
CHECK_INPUT(goal_position);
CHECK_INPUT(current_quat);
CHECK_INPUT(goal_quat);
CHECK_INPUT(batch_pose_idx);
const at::cuda::OptionalCUDAGuard guard(current_position.device());
return pose_distance(
out_distance, out_position_distance, out_rotation_distance,
distance_p_vector, distance_q_vector, out_gidx, current_position,
goal_position, current_quat, goal_quat, vec_weight, weight,
vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
horizon, mode, num_goals, compute_grad, write_distance, use_metric);
}
std::vector<torch::Tensor> backward_pose_distance_wrapper(
torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance) {
CHECK_INPUT(out_grad_p);
CHECK_INPUT(out_grad_q);
CHECK_INPUT(grad_distance);
CHECK_INPUT(grad_p_distance);
CHECK_INPUT(grad_q_distance);
const at::cuda::OptionalCUDAGuard guard(grad_distance.device());
return backward_pose_distance(
out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
m.def("pose_distance_backward", &backward_pose_distance_wrapper,
"Pose Distance Backward (curobolib)");
m.def("closest_point", &sphere_obb_clpt_wrapper,
"Closest Point OBB(curobolib)");
m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
"Swept Closest Point OBB(curobolib)");
m.def("self_collision_distance", &self_collision_distance_wrapper,
"Self Collision Distance (curobolib)");
}
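Every wrapper above funnels through CHECK_INPUT, so each tensor handed to these bindings must be CUDA-resident and contiguous. A minimal sketch of building the extension and satisfying that contract from Python, assuming torch.utils.cpp_extension.load with hypothetical module and source file names (not taken from this commit):

import torch
from torch.utils.cpp_extension import load

# Hypothetical build; the extension name and source file names are placeholders.
geom_cu = load(name="geom_cu", sources=["geom_cpp.cpp", "geom_cuda.cu"])

# CHECK_INPUT contract: tensors must be CUDA-resident and contiguous.
x = torch.rand(4, 7).t()             # a non-contiguous CPU view
x = x.contiguous().to("cuda")        # what the wrappers expect
assert x.is_cuda and x.is_contiguous()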

File diff suppressed because it is too large


@@ -0,0 +1,106 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
// CUDA forward declarations
std::vector<torch::Tensor>
matrix_to_quaternion(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
);
std::vector<torch::Tensor> kin_fused_forward(
torch::Tensor link_pos, torch::Tensor link_quat,
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const bool use_global_cumul = false);
std::vector<torch::Tensor> kin_fused_backward_16t(
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> kin_forward_wrapper(
torch::Tensor link_pos, torch::Tensor link_quat,
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const bool use_global_cumul = false) {
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
// TODO: add check input
return kin_fused_forward(
link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
}
std::vector<torch::Tensor> kin_backward_wrapper(
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false) {
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
return kin_fused_backward_16t(
grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
joint_map, joint_map_type, store_link_map, link_sphere_map,
link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
}
std::vector<torch::Tensor>
matrix_to_quaternion_wrapper(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
) {
const at::cuda::OptionalCUDAGuard guard(in_rot.device());
CHECK_INPUT(in_rot);
CHECK_INPUT(out_quat);
return matrix_to_quaternion(out_quat, in_rot);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
"Rotation Matrix to Quaternion (CUDA)");
}
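For reading purposes, a small PyTorch reference of the conversion that matrix_to_quaternion names, restricted to the common trace(R) > 0 branch and assuming a (w, x, y, z) quaternion layout; the CUDA kernel's actual layout and its handling of the remaining branches are not shown in this file:

import torch

def matrix_to_quaternion_reference(R):
    # R: (..., 3, 3) rotation matrices -> (..., 4) quaternions laid out as (w, x, y, z).
    # Valid when trace(R) > 0; a full implementation also covers the other branches.
    trace = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2]
    w = 0.5 * torch.sqrt(torch.clamp(1.0 + trace, min=1e-12))
    x = (R[..., 2, 1] - R[..., 1, 2]) / (4.0 * w)
    y = (R[..., 0, 2] - R[..., 2, 0]) / (4.0 * w)
    z = (R[..., 1, 0] - R[..., 0, 1]) / (4.0 * w)
    return torch.stack([w, x, y, z], dim=-1)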

File diff suppressed because it is too large


@@ -0,0 +1,121 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAGuard.h>
// CUDA forward declarations
std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer,
torch::Tensor sum, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim) {
CHECK_INPUT(step_vec);
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q,
epsilon, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim) {
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
CHECK_INPUT(x_0);
CHECK_INPUT(grad_0);
CHECK_INPUT(q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0,
grad_0, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer, torch::Tensor sum,
const int batch_size, const int v_dim, const int m) {
const at::cuda::OptionalCUDAGuard guard(sum.device());
// note: the CUDA definition takes (batch_size, m, v_dim) after the tensors
return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode) {
CHECK_INPUT(step_vec);
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
CHECK_INPUT(x_0);
CHECK_INPUT(grad_0);
CHECK_INPUT(q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q,
x_0, grad_0, epsilon, batch_size, m, v_dim,
stable_mode);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)");
m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)");
m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)");
m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug");
}
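The step and forward bindings implement the standard L-BFGS two-loop recursion over the (s, y, rho) history; a batched PyTorch sketch of that recursion, following the m x batch x v_dim buffer layout used in the kernel comments, offered as a reading aid rather than as the curobolib API:

import torch

def lbfgs_step_reference(rho, y, s, grad_q, epsilon):
    # rho: (m, b); y, s: (m, b, v); grad_q: (b, v) -> step_vec: (b, v)
    m = rho.shape[0]
    q = grad_q.clone()
    alpha = [None] * m
    for i in range(m - 1, -1, -1):                 # first loop, newest to oldest
        alpha[i] = rho[i] * (s[i] * q).sum(-1)     # (b,)
        q = q - alpha[i].unsqueeze(-1) * y[i]
    num = (s[-1] * y[-1]).sum(-1)                  # initial Hessian scaling
    den = (y[-1] * y[-1]).sum(-1)
    gamma = torch.clamp(num / den, min=0.0) + epsilon
    r = gamma.unsqueeze(-1) * q
    for i in range(m):                             # second loop, oldest to newest
        beta = rho[i] * (y[i] * r).sum(-1)
        r = r + (alpha[i] - beta).unsqueeze(-1) * s[i]
    return -r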


@@ -0,0 +1,869 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <math.h>
// #include <stdio.h>
//
// For the CUDA runtime routines (prefixed with "cuda_")
// #include <cuda_runtime.h>
// #include <cuda_fp16.h>
// #include <helper_cuda.h>
#define M_MAX 512
#define HALF_MAX 65504.0
#define M 15
#define VDIM 175 // 25 * 7,
#define FULL_MASK 0xffffffff
namespace Curobo {
namespace Optimization {
template <typename scalar_t>
__device__ __forceinline__ void
scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out,
const int v_dim) {
for (int i = 0; i < v_dim; i++) {
out[i] = a * b[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void
m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out,
const int v_dim, const int m) {
for (int j = 0; j < m; j++) {
for (int i = 0; i < v_dim; i++) {
out[j * v_dim + i] = a[j] * b[j * v_dim + i];
}
}
}
template <typename scalar_t>
__device__ __forceinline__ void vec_vec_dot(const scalar_t *a,
const scalar_t *b, scalar_t &out,
const int v_dim) {
for (int i = 0; i < v_dim; i++) {
out += a[i] * b[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void update_r(const scalar_t *rho_y,
const scalar_t *s_buffer, scalar_t *r,
scalar_t &alpha, const int v_dim) {
// dot product: and subtract with alpha
for (int i = 0; i < v_dim; i++) {
alpha -= rho_y[i] * r[i];
}
// scalar vector product:
for (int i = 0; i < v_dim; i++) {
r[i] = r[i] + alpha * s_buffer[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq,
const scalar_t alpha,
const int v_dim) {
//
for (int i = 0; i < v_dim; i++) {
gq[i] = gq[i] - (alpha * y_buffer[i]);
}
}
template <typename scalar_t>
__global__ void
lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer,
const scalar_t *y_buffer, const scalar_t *s_buffer,
const scalar_t *grad_q, const float epsilon,
const int batchSize, const int m, const int v_dim) {
// each thread writes one sphere of some link
const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch
const int b_idx = t;
if (t >= (batchSize)) {
return;
}
// get thread start address:
const int b_start_scalar_adrs = b_idx * m;
const int b_start_vec_adrs = b_idx * m * v_dim;
const int b_step_start_adrs = b_idx * v_dim;
scalar_t rho_s[M * VDIM];
// copy floats to local buffer?
// y_buffer, s_buffer, rho_buffer
// compute rho_s
scalar_t loc_ybuf[M * VDIM];
scalar_t loc_sbuf[M * VDIM];
scalar_t loc_rho[M];
scalar_t gq[VDIM]; //, l_q[VDIM];
scalar_t alpha_buffer[M];
scalar_t t_1 = 0, t_2 = 0; // vec_vec_dot accumulates into its output, so start at zero
for (int i = 0; i < m * v_dim; i++) {
loc_ybuf[i] = y_buffer[b_start_vec_adrs + i];
loc_sbuf[i] = s_buffer[b_start_vec_adrs + i];
}
for (int i = 0; i < v_dim; i++) {
gq[i] = grad_q[b_step_start_adrs + i];
}
for (int i = 0; i < m; i++) {
loc_rho[i] = rho_buffer[b_start_scalar_adrs + i];
}
m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m);
// for loop over m
for (int i = m - 1; i > -1; i--) { // full two-loop recursion: iterate over all m history entries
// l_start_vec_adrs = i * v_dim;
// scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim],
// v_dim);
alpha_buffer[i] = 0; // vec_vec_dot accumulates, so clear the entry first
vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim);
update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim);
}
// compute initial hessian:
vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1,
v_dim);
vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2,
v_dim);
t_1 = t_1 / t_2;
if (t_1 < 0) {
t_1 = 0;
}
t_1 += epsilon;
scalar_vec_product(t_1, &gq[0], &gq[0], v_dim);
m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m);
for (int i = 0; i < m; i++) {
// scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim],
// v_dim);
update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i],
v_dim);
}
// write gq to out grad:
for (int i = 0; i < v_dim; i++) {
step_vec[b_step_start_adrs + i] = -1.0 * gq[i];
}
}
template <typename psum_t>
__forceinline__ __device__ psum_t warpReduce(psum_t v, int elems,
unsigned mask) {
psum_t val = v;
int shift = 1;
for (int i = elems; i > 1; i /= 2) {
val += __shfl_down_sync(mask, val, shift);
shift *= 2;
}
// val += __shfl_down_sync(mask, val, 1); // i=32
// val += __shfl_down_sync(mask, val, 2); // i=16
// val += __shfl_down_sync(mask, val, 4); // i=8
// val += __shfl_down_sync(mask, val, 8); // i=4
// val += __shfl_down_sync(mask, val, 16); // i=2
return val;
}
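// Block-wide sum of one value per participating thread: each warp reduces its
// lanes with shuffle-downs, warp leaders park their partial sums in data[], and
// the first warp folds those partials into result[0]; the trailing
// __syncthreads() makes the result visible to every thread in the block.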
template <typename scalar_t, typename psum_t>
__forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data,
scalar_t *result) {
psum_t val = v;
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
val += __shfl_down_sync(mask, val, 1);
val += __shfl_down_sync(mask, val, 2);
val += __shfl_down_sync(mask, val, 4);
val += __shfl_down_sync(mask, val, 8);
val += __shfl_down_sync(mask, val, 16);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
if (m < 32) {
result[0] = (scalar_t)val;
} else {
data[(threadIdx.x + 1) / 32] = val;
}
}
if (m >= 32) {
__syncthreads();
int elems = (m + 31) / 32;
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
psum_t val2 = data[threadIdx.x % 32];
int shift = 1;
for (int i = elems - 1; i > 0; i /= 2) {
val2 += __shfl_down_sync(mask2, val2, shift);
shift *= 2;
}
// int leader = __ffs(mask2) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
result[0] = (scalar_t)val2;
}
}
}
__syncthreads();
}
// blockReduce
template <typename scalar_t, typename psum_t>
__forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data,
scalar_t *result) {
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
psum_t val = warpReduce(v, 32, mask);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
data[(threadIdx.x + 1) / 32] = val;
}
if (m >= 32) {
__syncthreads();
int elems = (m + 31) / 32;
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2);
// // int leader = __ffs(mask2) - 1; // select a leader lane
if (threadIdx.x == leader) {
result[0] = (scalar_t)val2;
}
}
} else {
if (threadIdx.x == leader) {
result[0] = (scalar_t)val;
}
}
__syncthreads();
}
template <typename scalar_t, typename psum_t>
__inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2,
const int m, psum_t *data, scalar_t *result) {
scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x];
reduce(val, m, data, result);
}
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
if (var < 0)
return 0;
else
return var;
}
//////////////////////////////////////////////////////////
// one block per batch
// v_dim threads per block
//////////////////////////////////////////////////////////
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_step_kernel(scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
const scalar_t *y_buffer, // m x b x 175
const scalar_t *s_buffer, // m x b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize,
const int m, const int v_dim) {
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
__shared__ scalar_t gq[175]; // per-block copy of grad_q for this batch; holds v_dim elements
assert(v_dim < 176);
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
for (int i = m - 1; i > -1; i--) {
dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
&data[0], &result);
alpha_buffer[i] = result * rho_buffer[i * batchsize + batch];
gq[threadIdx.x] =
gq[threadIdx.x] -
alpha_buffer[i] *
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// compute var1
scalar_t val1 =
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
scalar_t val2 =
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
reduce(val1 * val1, v_dim, data, &result);
scalar_t denominator = result;
reduce(val1 * val2, v_dim, data, &result);
scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
scalar_t gamma = relu(var1) + epsilon; // epsilon
gq[threadIdx.x] = gamma * gq[threadIdx.x];
for (int i = 0; i < m; i++) {
dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
&data[0], &result);
gq[threadIdx.x] =
gq[threadIdx.x] +
(alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) *
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
step_vec[batch * v_dim + threadIdx.x] =
-1 * gq[threadIdx.x]; // copy from shared memory to global memory
}
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const int batchsize, const int m,
const int v_dim) {
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
// __shared__ scalar_t y[175]; // temporary shared memory storage
// __shared__ scalar_t s[175]; // temporary shared memory storage
assert(v_dim <= VDIM);
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t y =
grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x];
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
for (int i = 1; i < m; i++) {
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// __syncthreads();
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
grad_0[batch * v_dim + threadIdx.x] = grad_q[batch * v_dim + threadIdx.x];
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x == 0) {
scalar_t rho = 1 / result;
for (int i = 1; i < m; i++) {
rho_buffer[(i - 1) * batchsize + batch] =
rho_buffer[i * batchsize + batch];
}
rho_buffer[(m - 1) * batchsize + batch] = rho;
}
}
template <typename scalar_t, typename psum_t>
__global__ void reduce_kernel(
scalar_t *vec1, // b x 175
scalar_t *vec2, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *sum_out, // m x b x 1
const int batchsize, const int m,
const int v_dim) // s_buffer and y_buffer are not rolled by default
{
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
////////////////////
// update_buffer
////////////////////
scalar_t y = vec1[batch * v_dim + threadIdx.x];
scalar_t s = vec2[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
if (threadIdx.x == 0) {
sum_out[batch] = 1 / numerator;
}
// return;
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
rho_buffer[threadIdx.x * batchsize + batch] =
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
} else if (threadIdx.x == m - 1) {
scalar_t rho = (1 / numerator);
rho_buffer[threadIdx.x * batchsize + batch] = rho;
}
}
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_update_buffer_and_step_v1(
scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize, const int m, const int v_dim,
const bool rolled_ys = false,
const bool stable_mode =
false) // s_buffer and y_buffer are not rolled by default
{
// extern __shared__ scalar_t alpha_buffer_sh[];
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
scalar_t *my_smem_rc = reinterpret_cast<scalar_t *>(my_smem);
scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x
scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m
scalar_t *s_buffer_sh =
&my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x
scalar_t *y_buffer_sh =
&my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t gq;
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
////////////////////
// update_buffer
////////////////////
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
// if y is close to zero
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce_v1(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
// scalar_t rho = 1.0/numerator;
if (!rolled_ys) {
#pragma unroll
for (int i = 1; i < m; i++) {
scalar_t st =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
scalar_t yt =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st;
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt;
s_buffer_sh[m * threadIdx.x + i - 1] = st;
y_buffer_sh[m * threadIdx.x + i - 1] = yt;
}
}
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
s_buffer_sh[m * threadIdx.x + m - 1] = s;
y_buffer_sh[m * threadIdx.x + m - 1] = y;
grad_0[batch * v_dim + threadIdx.x] = gq;
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch];
rho_buffer[threadIdx.x * batchsize + batch] = rho;
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
}
if (threadIdx.x == m - 1) {
scalar_t rho = 1.0 / numerator;
// if this is nan, make it zero:
if (stable_mode && numerator == 0.0) {
rho = 0.0;
}
rho_buffer[threadIdx.x * batchsize + batch] = rho;
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
}
// return;
__syncthreads();
////////////////////
// step
////////////////////
// scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
#pragma unroll
for (int i = m - 1; i > -1; i--) {
// reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
// v_dim, &data[0], &result);
reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
alpha_buffer_sh[threadIdx.x * m + i] =
result * rho_buffer_sh[i * batchsize + batch];
// gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim +
// batch*v_dim + threadIdx.x];
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
y_buffer_sh[m * threadIdx.x + i];
}
// compute var1
reduce_v1(y * y, v_dim, data, &result);
scalar_t denominator = result;
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
// scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
// To improve stability, uncomment below line: [this however leads to poor
// convergence]
if (stable_mode && denominator == 0.0) {
var1 = epsilon;
}
scalar_t gamma = relu(var1);
gq = gamma * gq;
#pragma unroll
for (int i = 0; i < m; i++) {
// reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
// v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] -
// result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim +
// batch*v_dim + threadIdx.x];
reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
result * rho_buffer_sh[i * batchsize + batch]) *
s_buffer_sh[m * threadIdx.x + i];
}
step_vec[batch * v_dim + threadIdx.x] =
-1.0 * gq; // copy from shared memory to global memory
}
// (32/M) rolls per warp
// Threads in a warp on a GPU execute in lockstep. We leverage that to do
// the roll without using temporary storage or explicit synchronization.
template <typename scalar_t>
__global__ void lbfgs_roll(scalar_t *a, // m x b x 175
scalar_t *b, // m x b x 175
const int m_t, const int batchsize, const int m,
const int v_dim) {
assert(m_t <= 32);
int t = blockDim.x * blockIdx.x + threadIdx.x;
if (t >= m_t * v_dim * batchsize)
return;
int _m = t % m_t;
int _v_dim = (t / m_t) % v_dim;
int batch = t / (m_t * v_dim); // consistent with the _m and _v_dim decomposition above
if (_m < m - 1) {
a[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
b[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
}
}
template <typename scalar_t, typename psum_t, bool rolled_ys>
__global__ void lbfgs_update_buffer_and_step(
scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize, const int m, const int v_dim,
const bool stable_mode =
false) // s_buffer and y_buffer are not rolled by default
{
// extern __shared__ scalar_t alpha_buffer_sh[];
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
scalar_t *alpha_buffer_sh = reinterpret_cast<scalar_t *>(my_smem);
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t gq;
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
////////////////////
// update_buffer
////////////////////
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
// if y is close to zero
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
// scalar_t rho = 1.0/numerator;
if (!rolled_ys) {
for (int i = 1; i < m; i++) {
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
}
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
grad_0[batch * v_dim + threadIdx.x] = gq;
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
rho_buffer[threadIdx.x * batchsize + batch] =
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
}
if (threadIdx.x == m - 1) {
scalar_t rho = 1.0 / numerator;
// if this is nan, make it zero:
if (stable_mode && numerator == 0.0) {
rho = 0.0;
}
rho_buffer[threadIdx.x * batchsize + batch] = rho;
}
// return;
//__syncthreads();
////////////////////
// step
////////////////////
// scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
#pragma unroll
for (int i = m - 1; i > -1; i--) {
reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
v_dim, &data[0], &result);
alpha_buffer_sh[threadIdx.x * m + i] =
result * rho_buffer[i * batchsize + batch];
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// compute var1
reduce(y * y, v_dim, data, &result);
scalar_t denominator = result;
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
// scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
// To improve stability, uncomment below line: [this however leads to poor
// convergence]
if (stable_mode && denominator == 0.0) {
var1 = epsilon;
}
scalar_t gamma = relu(var1);
gq = gamma * gq;
#pragma unroll
for (int i = 0; i < m; i++) {
reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
v_dim, &data[0], &result);
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
result * rho_buffer[i * batchsize + batch]) *
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
step_vec[batch * v_dim + threadIdx.x] =
-1.0 * gq; // copy from shared memory to global memory
}
} // namespace Optimization
} // namespace Curobo
std::vector<torch::Tensor>
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
const int threadsPerBlock = 128;
const int blocksPerGrid =
((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
// launch threads per batch:
// int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
// const int blocksPerGrid = batch_size; //((batch_size) + threadsPerBlock -
// 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
step_vec.scalar_type(), "lbfgs_step_cu", ([&] {
lbfgs_step_kernel_old<scalar_t>
<<<blocksPerGrid, threadsPerBlock,
v_dim * sizeof(step_vec.scalar_type()), stream>>>(
step_vec.data_ptr<scalar_t>(), rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
grad_q.data_ptr<scalar_t>(), epsilon, batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {step_vec};
}
std::vector<torch::Tensor>
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
// const int threadsPerBlock = 128;
// launch threads per batch:
// int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
int threadsPerBlock = v_dim;
const int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_update_cu", ([&] {
lbfgs_update_buffer_kernel<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock,
v_dim * sizeof(y_buffer.scalar_type()), stream>>>(
rho_buffer.data_ptr<scalar_t>(), y_buffer.data_ptr<scalar_t>(),
s_buffer.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
x_0.data_ptr<scalar_t>(), grad_0.data_ptr<scalar_t>(),
grad_q.data_ptr<scalar_t>(), batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}
std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode) {
using namespace Curobo::Optimization;
// call first kernel:
int threadsPerBlock = v_dim;
assert(threadsPerBlock < 1024);
assert(m < M_MAX);
int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int smemsize = 0;
if (true) {
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] {
smemsize = m * threadsPerBlock * sizeof(scalar_t);
lbfgs_update_buffer_and_step<scalar_t, scalar_t, false>
<<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
step_vec.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
epsilon, batch_size, m, v_dim, stable_mode);
});
} else {
// v1 does not work
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] {
smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) +
m * batch_size * sizeof(scalar_t);
lbfgs_update_buffer_and_step_v1<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
step_vec.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
epsilon, batch_size, m, v_dim, false, stable_mode);
});
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}
std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer,
torch::Tensor sum, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1);
int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
// printf("threadsPerBlock:%d, blocksPerGrid: %d\n",
// threadsPerBlock, blocksPerGrid);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
vec.scalar_type(), "reduce_cu", ([&] {
reduce_kernel<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
vec.data_ptr<scalar_t>(), vec2.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(), sum.data_ptr<scalar_t>(),
batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {sum, rho_buffer};
}
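For reference, the update half of these kernels (lbfgs_update_buffer_kernel and the first section of the fused kernel) shifts the history window and appends the newest (s, y, rho) triple; a compact PyTorch sketch of that bookkeeping, assuming the m x batch x v_dim layout from the kernel comments and offered only as a reading aid:

import torch

def lbfgs_update_reference(rho, y_buf, s_buf, q, grad_q, x_0, grad_0,
                           stable_mode=False):
    # rho: (m, b); y_buf, s_buf: (m, b, v); q, grad_q, x_0, grad_0: (b, v)
    y = grad_q - grad_0
    s = q - x_0
    ys = (y * s).sum(-1)
    new_rho = 1.0 / ys
    if stable_mode:  # zero rho where the curvature dot product vanishes
        new_rho = torch.where(ys == 0, torch.zeros_like(ys), new_rho)
    rho = torch.cat([rho[1:], new_rho[None]])        # roll window, append newest
    y_buf = torch.cat([y_buf[1:], y[None]])
    s_buf = torch.cat([s_buf[1:], s[None]])
    return rho, y_buf, s_buf, q.clone(), grad_q.clone()   # new x_0 and grad_0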


@@ -0,0 +1,103 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAGuard.h>
// CUDA forward declarations
std::vector<torch::Tensor>
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999);
std::vector<torch::Tensor> line_search_cuda(
// torch::Tensor m,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> line_search_call(
// torch::Tensor m,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize) {
CHECK_INPUT(g_x);
CHECK_INPUT(x_set);
CHECK_INPUT(step_vec);
CHECK_INPUT(c_0);
CHECK_INPUT(alpha_list);
CHECK_INPUT(c_idx);
// CHECK_INPUT(m);
CHECK_INPUT(best_x);
CHECK_INPUT(best_c);
CHECK_INPUT(best_grad);
const at::cuda::OptionalCUDAGuard guard(best_x.device());
// return line_search_cuda(m, g_x, step_vec, c_0, alpha_list, c_1, c_2,
// strong_wolfe, l1, l2, batchsize);
return line_search_cuda(best_x, best_c, best_grad, g_x, x_set, step_vec, c_0,
alpha_list, c_idx, c_1, c_2, strong_wolfe,
approx_wolfe, l1, l2, batchsize);
}
std::vector<torch::Tensor>
update_best_call(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999) {
CHECK_INPUT(best_cost);
CHECK_INPUT(best_q);
CHECK_INPUT(cost);
CHECK_INPUT(q);
CHECK_INPUT(current_iteration);
CHECK_INPUT(best_iteration);
const at::cuda::OptionalCUDAGuard guard(cost.device());
return update_best_cuda(best_cost, best_q, best_iteration, current_iteration, cost, q, d_opt,
cost_s1, cost_s2, iteration, delta_threshold, relative_threshold);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("update_best", &update_best_call, "Update Best (CUDA)");
m.def("line_search", &line_search_call, "Line search (CUDA)");
}
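line_search selects, per batch, the largest step that satisfies the (strong or weak) Wolfe conditions, falling back to the Armijo-only candidate and then to index 1 when approx_wolfe is off. A PyTorch sketch of that selection, written against the torch expressions quoted in the kernel comments of the next file; it indexes batched tensors directly instead of using the flattened c_idx offsets, and assumes alpha_list is a single (l1,) vector, as the kernels read it without a batch offset:

import torch

def line_search_reference(g_x, x_set, step_vec, c, alpha_list, c_1, c_2,
                          strong_wolfe, approx_wolfe):
    # g_x, x_set: (b, l1, d); step_vec: (b, d, 1); c: (b, l1, 1); alpha_list: (l1,)
    g_step = g_x @ step_vec                           # directional derivatives, (b, l1, 1)
    g0 = g_step[:, 0:1]
    alpha = alpha_list.view(1, -1, 1)
    wolfe_1 = c <= c[:, 0:1] + c_1 * alpha * g0       # sufficient decrease
    if strong_wolfe:
        wolfe_2 = g_step.abs() <= c_2 * g0.abs()      # curvature condition
    else:
        wolfe_2 = g_step >= c_2 * g0
    wolfe = wolfe_1 & wolfe_2
    _, m_idx = (wolfe * (alpha + 0.1)).max(dim=-2)    # best fully-satisfying candidate
    _, m1_idx = (wolfe_1 * (alpha + 0.1)).max(dim=-2) # best Armijo-only candidate
    if not approx_wolfe:
        m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
        m_idx[m_idx == 0] = 1
    b = torch.arange(g_x.shape[0])
    sel = m_idx.view(-1)
    return x_set[b, sel], c[b, sel], g_x[b, sel]      # best_x, best_c, best_grad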


@@ -0,0 +1,398 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#pragma once
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cub/cub.cuh>
#include <math.h>
// #include <stdio.h>
//
// For the CUDA runtime routines (prefixed with "cuda_")
// #include <cuda_runtime.h>
// #include <cuda_fp16.h>
// #include <helper_cuda.h>
#define FULL_MASK 0xffffffff
namespace Curobo {
namespace Optimization {
template <typename scalar_t, typename psum_t>
__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask,
psum_t *data, scalar_t *result) {
psum_t val = v;
val += __shfl_down_sync(mask, val, 1);
val += __shfl_down_sync(mask, val, 2);
val += __shfl_down_sync(mask, val, 4);
val += __shfl_down_sync(mask, val, 8);
val += __shfl_down_sync(mask, val, 16);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
if (m <= 32) {
result[0] = (scalar_t)val;
} else {
data[(threadIdx.x + 1) / 32] = val;
}
}
if (m > 32) {
__syncthreads();
int elems = (m + 31) / 32;
assert(elems <= 32);
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x < elems) { // only the first warp will do this work
psum_t val2 = data[threadIdx.x % 32];
int shift = 1;
for (int i = elems - 1; i > 0; i /= 2) {
val2 += __shfl_down_sync(mask2, val2, shift);
shift *= 2;
}
// int leader = __ffs(mask2) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
result[0] = (scalar_t)val2;
}
}
}
__syncthreads();
}
// Launched with l2 threads/block and batchsize blocks
template <typename scalar_t, typename psum_t>
__global__ void line_search_kernel(
// int64_t *m_idx, // 4x1x1
scalar_t *best_x, // 4x280
scalar_t *best_c, // 4x1
scalar_t *best_grad, // 4x280
const scalar_t *g_x, // 4x6x280
const scalar_t *x_set, // 4x6x280
const scalar_t *step_vec, // 4x280x1
const scalar_t *c, // 4x6x1
const scalar_t *alpha_list, // 4x6x1
const int64_t *c_idx, // 4x1x1
const float c_1, const float c_2, const bool strong_wolfe,
const bool approx_wolfe,
const int l1, // 6
const int l2, // 280
const int batchsize) // 4
{
int batch = blockIdx.x;
__shared__ psum_t data[32];
__shared__ scalar_t result[32];
assert(l1 <= 32);
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
if (threadIdx.x >= l2) {
return;
}
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
// g_step = g0 @ step_vec_T
// g_x @ step_vec_T
for (int i = 0; i < l1; i++) {
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
&data[0], &result[i]);
}
__shared__ scalar_t step_success[32];
__shared__ scalar_t step_success_w1[32];
assert(blockDim.x >= l1);
bool wolfe_1 = false;
bool wolfe = false;
bool condition = threadIdx.x < l1;
if (condition) {
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
// condition 1:
wolfe_1 = c[batch * l1 + threadIdx.x] <=
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
// condition 2:
bool wolfe_2;
if (strong_wolfe) {
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
} else {
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
}
wolfe = wolfe_1 & wolfe_2;
step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
}
__syncthreads();
__shared__ int idx_shared;
if (threadIdx.x == 0) {
int m_id = 0;
int m1_id = 0;
scalar_t max1 = step_success[0];
scalar_t max2 = step_success_w1[0];
for (int i = 1; i < l1; i++) {
if (max1 < step_success[i]) {
max1 = step_success[i];
m_id = i;
}
if (max2 < step_success_w1[i]) {
max2 = step_success_w1[i];
m1_id = i;
}
}
if (!approx_wolfe) {
// m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
if (m_id == 0) {
m_id = m1_id;
}
// m_idx[m_idx == 0] = 1
if (m_id == 0) {
m_id = 1;
}
}
idx_shared = m_id + c_idx[batch];
}
////////////////////////////////////
// write outputs using the computed index.
// one index per batch is computed
////////////////////////////////////
// l2 is d_opt, l1 is line_search n.
// idx_shared contains index in l1
//
__syncthreads();
if (threadIdx.x < l2) {
if (threadIdx.x == 0) {
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
}
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
}
if (threadIdx.x == 0) {
best_c[batch] = c[idx_shared];
}
}
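// line_search_kernel_mask below computes the same per-batch selection, but
// replaces the shared-memory argmax over step_success with warp ballots: the
// wolfe flags are packed into bitmasks and the last satisfying candidate is
// recovered with __brev/__ffs (see the commented-out cross-check further down).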
// Launched with l2 threads/block and #blocks = batchsize
template <typename scalar_t, typename psum_t>
__global__ void line_search_kernel_mask(
// int64_t *m_idx, // 4x1x1
scalar_t *best_x, // 4x280
scalar_t *best_c, // 4x1
scalar_t *best_grad, // 4x280
const scalar_t *g_x, // 4x6x280
const scalar_t *x_set, // 4x6x280
const scalar_t *step_vec, // 4x280x1
const scalar_t *c, // 4x6x1
const scalar_t *alpha_list, // 4x6x1
const int64_t *c_idx, // 4x1x1
const float c_1, const float c_2, const bool strong_wolfe,
const bool approx_wolfe,
const int l1, // 6
const int l2, // 280
const int batchsize) // 4
{
int batch = blockIdx.x;
__shared__ psum_t data[32];
__shared__ scalar_t result[32];
assert(l1 <= 32);
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
if (threadIdx.x >= l2) {
return;
}
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
// g_step = g0 @ step_vec_T
// g_x @ step_vec_T
for (int i = 0; i < l1; i++) {
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
&data[0], &result[i]);
}
// __shared__ scalar_t step_success[32];
// __shared__ scalar_t step_success_w1[32];
assert(blockDim.x >= l1);
bool wolfe_1 = false;
bool wolfe = false;
bool condition = threadIdx.x < l1;
if (condition) {
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
// condition 1:
wolfe_1 = c[batch * l1 + threadIdx.x] <=
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
// condition 2:
bool wolfe_2;
if (strong_wolfe) {
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
} else {
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
}
// wolfe = torch.logical_and(wolfe_1, wolfe_2)
wolfe = wolfe_1 & wolfe_2;
// // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1)
// // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1)
// step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
// step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
}
unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition);
unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition);
  // get the index of the last occurrence of true
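  // __ballot_sync sets bit k of the mask iff lane k passed the test, so after
  // __brev the least-significant set bit corresponds to the highest passing
  // lane; 32 - __ffs therefore yields the highest-index step that satisfied
  // the condition, while a value of 32 means no lane passed at all.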
unsigned msk1_brev = __brev(msk1);
unsigned msk_brev = __brev(msk);
  int id1 = 32 - __ffs(msk1_brev); // position of least significant bit set to 1
  int id = 32 - __ffs(msk_brev);   // position of least significant bit set to 1
__syncthreads();
__shared__ int idx_shared;
if (threadIdx.x == 0) {
if (!approx_wolfe) {
if (id == 32) { // msk is zero
id = id1;
}
if (id == 0) { // bit 0 is set
id = id1;
}
if (id == 32) { // msk is zero
id = 1;
}
if (id == 0) {
id = 1;
}
} else {
if (id == 32) { // msk is zero
id = 0;
}
}
// // _, m_idx = torch.max(step_success, dim=-2)
// // _, m1_idx = torch.max(step_success_w1, dim=-2)
// int m_id = 0;
// int m1_id = 0;
// scalar_t max1 = step_success[0];
// scalar_t max2 = step_success_w1[0];
// for (int i=1; i<l1; i++) {
// if (max1<step_success[i]) {
// max1 = step_success[i];
// m_id = i;
// }
// if (max2<step_success_w1[i]) {
// max2 = step_success_w1[i];
// m1_id = i;
// }
// }
// // m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
// if (m_id == 0) {
// m_id = m1_id;
// }
// // m_idx[m_idx == 0] = 1
// if (m_id == 0) {
// m_id = 1;
// }
// if (id != m_id) {
// printf("id=%d, m_id=%d\n", id, m_id);
// printf("msk1=%x, msk=%x, raw id1=%d, raw id=%d\n", msk1, msk,
// 32-__ffs(msk1_brev), 32-__ffs(msk_brev));
// }
// m_idx[batch] = m_id;
// m_idx[batch] = id;
idx_shared = id + c_idx[batch];
}
////////////////////////////////////
// write outputs using the computed index.
// one index per batch is computed
////////////////////////////////////
__syncthreads();
if (threadIdx.x < l2) {
if (threadIdx.x == 0) {
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
}
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
}
if (threadIdx.x == 0) {
best_c[batch] = c[idx_shared];
}
}
} // namespace Optimization
} // namespace Curobo
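// Host launcher for the masked line-search kernel. As read by the kernel:
// best_x/best_grad are [batch, l2], best_c is [batch], g_x/x_set are
// [batch, l1, l2], step_vec is [batch, l2], c_0 is [batch, l1], c_idx gives
// the row offset of each batch item into the flattened candidate set, and
// only the first l1 entries of alpha_list are used (the same step sizes are
// shared across the batch). One block is launched per batch item, with l2
// rounded up to a full warp.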
std::vector<torch::Tensor> line_search_cuda(
// torch::Tensor m_idx,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize) {
using namespace Curobo::Optimization;
assert(l2 <= 1024);
  // round l2 up to a multiple of 32 so full warps are launched
const int threadsPerBlock = 32 * ((l2 + 31) / 32); // l2;
const int blocksPerGrid = batchsize;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
g_x.scalar_type(), "line_search_cu", ([&] {
line_search_kernel_mask<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
// m_idx.data_ptr<int>(),
best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
l1, l2, batchsize);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {best_x, best_c, best_grad};
}

View File

@@ -0,0 +1,790 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "helper_math.h"
#include <cmath>
#include <cuda_fp16.h>
#include <vector>
#define SINGLE_GOAL 0
#define BATCH_GOAL 1
#define GOALSET 2
#define BATCH_GOALSET 3
namespace Curobo
{
namespace Pose
{
__device__ __forceinline__ void
transform_vec_quat(const float4 q, // quaternion as (qx, qy, qz, qw)
const float *d_vec_weight,
float *C) {
  // Rotate each 3-vector half of d_vec_weight by quaternion q (equivalent to
  // q * v * q_conj); identity rotations skip the expansion below.
  if (q.x != 0 || q.y != 0 || q.z != 0) {
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
  } else {
C[0] = d_vec_weight[0];
C[1] = d_vec_weight[1];
C[2] = d_vec_weight[2];
C[3] = d_vec_weight[3];
C[4] = d_vec_weight[4];
C[5] = d_vec_weight[5];
}
}
__device__ __forceinline__ void
inv_transform_vec_quat(const float4 q_in, // quaternion as (qx, qy, qz, qw)
const float *d_vec_weight,
float *C) {
  // Rotate each 3-vector half of d_vec_weight by the conjugate (inverse) of
  // q_in; identity rotations skip the expansion below.
  if (q_in.x != 0 || q_in.y != 0 || q_in.z != 0) {
float4 q = make_float4(-q_in.x, -q_in.y, -q_in.z, q_in.w);
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
  } else {
C[0] = d_vec_weight[0];
C[1] = d_vec_weight[1];
C[2] = d_vec_weight[2];
C[3] = d_vec_weight[3];
C[4] = d_vec_weight[4];
C[5] = d_vec_weight[5];
}
}
__device__ __forceinline__ void
compute_quat_distance(float *result, const float4 a, const float4 b) {
// We compute distance with the conjugate of b
float r_w = (a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z);
if (r_w < 0.0) {
r_w = 1.0;
} else {
r_w = -1.0;
}
result[0] = r_w * (-1 * a.w * b.x + b.w * a.x - a.y * b.z + b.y * a.z);
result[1] = r_w * (-1 * a.w * b.y + b.w * a.y - a.z * b.x + b.z * a.x);
result[2] = r_w * (-1 * a.w * b.z + b.w * a.z - a.x * b.y + b.x * a.y);
}
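// The three components above are the vector part of a * conj(b) (the error
// quaternion), with the sign flipped whenever the scalar part is
// non-negative, so that the error always corresponds to the shorter of the
// two equivalent rotations (quaternion double cover).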
__device__ __forceinline__ void
compute_distance(float *distance_vec, float &distance, float &position_distance,
float &rotation_distance, const float3 current_position,
const float3 goal_position, const float4 current_quat,
const float4 goal_quat, const float *vec_weight,
const float *vec_convergence, const float position_weight,
const float rotation_weight) {
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
// compute position distance
*(float3 *)&distance_vec[0] = current_position - goal_position;
// distance_vec[0] = goal_position
position_distance = 0;
rotation_distance = 0;
// scale by vec weight and position weight:
#pragma unroll 3
for (int i = 0; i < 3; i++) {
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
position_distance += distance_vec[i] * distance_vec[i];
}
#pragma unroll 3
for (int i = 3; i < 6; i++) {
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
rotation_distance += distance_vec[i] * distance_vec[i];
}
distance = 0;
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
rotation_distance = sqrtf(rotation_distance);
distance += rotation_weight * rotation_distance;
}
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
position_distance = sqrtf(position_distance);
distance += position_weight * position_distance;
}
}
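// In short: position error = W_p * (p - p_goal) and rotation error =
// W_r * vec(q_err), where vec_weight stores the rotation weights in [0:3]
// and the position weights in [3:6]; each weighted norm is added to the
// total cost with its own weight, but only once it exceeds the matching
// convergence threshold in vec_convergence.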
__device__ __forceinline__ void compute_metric_distance(
float *distance_vec, float &distance, float &position_distance,
float &rotation_distance, const float3 current_position,
const float3 goal_position, const float4 current_quat,
const float4 goal_quat, const float *vec_weight,
const float *vec_convergence, const float position_weight,
const float p_alpha, const float rotation_weight, const float r_alpha) {
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
// compute position distance
*(float3 *)&distance_vec[0] = current_position - goal_position;
// distance_vec[0] = goal_position
position_distance = 0;
rotation_distance = 0;
// scale by vec weight and position weight:
#pragma unroll 3
for (int i = 0; i < 3; i++) {
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
// position_distance += powf(distance_vec[i],2);
position_distance += distance_vec[i] * distance_vec[i];
}
#pragma unroll 3
for (int i = 3; i < 6; i++) {
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
// rotation_distance += powf(distance_vec[i],2);
rotation_distance += distance_vec[i] * distance_vec[i];
}
distance = 0;
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
rotation_distance = sqrtf(rotation_distance);
distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance));
}
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
position_distance = sqrtf(position_distance);
distance += position_weight * log2f(coshf(p_alpha * position_distance));
}
}
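// Same structure as compute_distance, but each term contributes
// weight * log2(cosh(alpha * distance)) instead of the raw distance: a
// smooth cost that is quadratic near zero and grows linearly far from the
// goal, with the matching gradient handled in the metric kernel below.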
template <typename scalar_t>
__global__ void
backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3]
scalar_t *out_grad_q, // [b,4]
const scalar_t *grad_distance, // [b,1]
const scalar_t *grad_p_distance, // [b,1]
const scalar_t *grad_q_distance, // [b,1]
const scalar_t *pose_weight, // [2]
const scalar_t *grad_p_vec, // [b,3]
const scalar_t *grad_q_vec, // [b,4]
const int batch_size) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
// read data
const float g_distance = grad_distance[batch_idx];
const float2 p_weight = *(float2 *)&pose_weight[0];
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
const float g_p_distance = grad_p_distance[batch_idx];
const float g_q_distance = grad_q_distance[batch_idx];
// compute position gradient
float3 g_p =
(g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3
float3 g_q =
(g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3
// write out
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
}
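// Chain rule: the incoming gradient of the combined distance is scaled by
// the respective pose weight (pose_weight[0] for rotation, pose_weight[1]
// for position), added to the gradient flowing through the individual
// position/rotation distance outputs, and multiplied by the direction
// vectors stored during the forward pass.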
template <typename scalar_t>
__global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3]
scalar_t *out_grad_q, // [b,4]
const scalar_t *grad_distance, // [b,1]
const scalar_t *pose_weight, // [2]
const scalar_t *grad_p_vec, // [b,3]
const scalar_t *grad_q_vec, // [b,4]
const int batch_size) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
// read data
const float g_distance = grad_distance[batch_idx];
const float2 p_weight = *(float2 *)&pose_weight[0];
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
// compute position gradient
float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3
float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3
// write out
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
}
template <typename scalar_t, bool write_distance>
__global__ void goalset_pose_distance_kernel(
scalar_t *out_distance, scalar_t *out_position_distance,
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
int32_t *out_gidx, const scalar_t *current_position,
const scalar_t *goal_position, const scalar_t *current_quat,
const scalar_t *goal_quat, const scalar_t *vec_weight,
const scalar_t *weight, const scalar_t *vec_convergence,
const scalar_t *run_weight, const scalar_t *run_vec_weight,
const int32_t *batch_pose_idx, const int mode, const int num_goals,
const int batch_size, const int horizon, const bool write_grad = false) {
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
const int batch_idx = t_idx / horizon;
const int h_idx = t_idx - (batch_idx * horizon);
if (batch_idx >= batch_size || h_idx >= horizon) {
return;
}
// read current pose:
float3 position =
*(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
// read weights:
float position_weight = weight[1];
float rotation_weight = weight[0];
if (!write_distance) {
position_weight *= run_weight[h_idx];
rotation_weight *= run_weight[h_idx];
if (position_weight == 0.0 && rotation_weight == 0.0) {
return;
}
}
float3 l_goal_position;
float4 l_goal_quat;
float distance_vec[6]; // = {0.0};
float pose_distance = 0.0;
float position_distance = 0.0;
float rotation_distance = 0.0;
float best_distance = INFINITY;
float best_position_distance = 0.0;
float best_rotation_distance = 0.0;
float best_distance_vec[6] = {0.0};
float best_des_vec_weight[6] = {0.0};
float d_vec_convergence[2];
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
int best_idx = -1;
float d_vec_weight[6];
float des_vec_weight[6] = {0.0};
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
if (h_idx < horizon - 1) {
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
}
// read offset
int offset = batch_pose_idx[batch_idx];
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
offset = (offset)*num_goals;
}
for (int k = 0; k < num_goals; k++) {
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4];
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
compute_distance(&distance_vec[0], pose_distance, position_distance,
rotation_distance, position, l_goal_position, quat,
l_goal_quat,
&des_vec_weight[0], //&l_vec_weight[0],
&d_vec_convergence[0], //&l_vec_convergence[0],
position_weight, rotation_weight);
if (pose_distance <= best_distance) {
best_idx = k;
best_distance = pose_distance;
best_position_distance = position_distance;
best_rotation_distance = rotation_distance;
if (write_grad) {
//inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
#pragma unroll 6
for (int i = 0; i < 6; i++) {
best_distance_vec[i] = distance_vec[i];
best_des_vec_weight[i] = des_vec_weight[i];
}
}
}
}
// write out:
// write out pose distance:
out_distance[batch_idx * horizon + h_idx] = best_distance;
if (write_distance) {
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
}
out_gidx[batch_idx * horizon + h_idx] = best_idx;
if (write_grad) {
if (write_distance) {
position_weight = 1;
rotation_weight = 1;
}
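    // Gradient write-out: d/dp of position_weight * ||W_p (p - p_goal)|| is
    // W_p^T W_p (p - p_goal) / ||W_p (p - p_goal)||, which is what the two
    // multiplications by the weights and the division by the best distance
    // implement below; the rotation gradient is formed analogously from the
    // error-quaternion components.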
if (best_position_distance > 0) {
best_position_distance = (position_weight / best_position_distance);
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
} else {
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
}
if (best_rotation_distance > 0) {
best_rotation_distance = rotation_weight / best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
} else {
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
}
}
}
template <typename scalar_t, bool write_distance>
__global__ void goalset_pose_distance_metric_kernel(
scalar_t *out_distance, scalar_t *out_position_distance,
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
int32_t *out_gidx, const scalar_t *current_position,
const scalar_t *goal_position, const scalar_t *current_quat,
const scalar_t *goal_quat, const scalar_t *vec_weight,
const scalar_t *weight, const scalar_t *vec_convergence,
const scalar_t *run_weight, const scalar_t *run_vec_weight,
const int32_t *batch_pose_idx, const int mode, const int num_goals,
const int batch_size, const int horizon, const bool write_grad = false) {
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
const int batch_idx = t_idx / horizon;
const int h_idx = t_idx - (batch_idx * horizon);
if (batch_idx >= batch_size || h_idx >= horizon) {
return;
}
// read current pose:
float3 position =
*(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
// read weights:
float position_weight = weight[1];
float p_w_alpha = weight[3];
float r_w_alpha = weight[2];
float rotation_weight = weight[0];
if (!write_distance) {
position_weight *= run_weight[h_idx];
rotation_weight *= run_weight[h_idx];
p_w_alpha *= run_weight[h_idx];
r_w_alpha *= run_weight[h_idx];
if (position_weight == 0.0 && rotation_weight == 0.0) {
return;
}
}
float d_vec_convergence[2];
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
float d_vec_weight[6];
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
if (h_idx < horizon - 1) {
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
}
float des_vec_weight[6] = {0.0};
float3 l_goal_position;
float4 l_goal_quat;
float distance_vec[6]; // = {0.0};
float pose_distance = 0.0;
float position_distance = 0.0;
float rotation_distance = 0.0;
float best_distance = INFINITY;
float best_position_distance = 0.0;
float best_rotation_distance = 0.0;
float best_distance_vec[6] = {0.0};
float best_des_vec_weight[6] = {0.0};
  int best_idx = -1;
int offset = batch_pose_idx[batch_idx];
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
offset = (offset)*num_goals;
}
for (int k = 0; k < num_goals; k++) {
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
float4 gq4 =
*(float4 *)&goal_quat[(offset + k) * 4]; // reading qw, qx, qy, qz
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
compute_metric_distance(
&distance_vec[0], pose_distance, position_distance, rotation_distance,
position, l_goal_position, quat, l_goal_quat,
&des_vec_weight[0], //&l_vec_weight[0],
&d_vec_convergence[0], //&l_vec_convergence[0],
position_weight, p_w_alpha, rotation_weight, r_w_alpha);
if (pose_distance <= best_distance) {
best_idx = k;
best_distance = pose_distance;
best_position_distance = position_distance;
best_rotation_distance = rotation_distance;
if (write_grad) {
// inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
#pragma unroll 6
for (int i = 0; i < 6; i++) {
best_distance_vec[i] = distance_vec[i];
best_des_vec_weight[i] = des_vec_weight[i];
}
}
}
}
// add scaling metric:
// best_distance = log2f(acoshf(best_distance));
// write out:
// write out pose distance:
out_distance[batch_idx * horizon + h_idx] = best_distance;
if (write_distance) {
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
}
out_gidx[batch_idx * horizon + h_idx] = best_idx;
if (write_grad) {
// write gradient
// compute scalar term:
// -w * (d_vec)/ (length * length(negative +1) * acos(length)
// -w * (d_vec) * sinh(length) / (length * cosh(length))
// compute extra term:
if (write_distance) {
position_weight = 1.0;
rotation_weight = 1.0;
}
if (best_position_distance > 0) {
best_position_distance =
(p_w_alpha * position_weight *
sinhf(p_w_alpha * best_position_distance)) /
(best_position_distance * coshf(p_w_alpha * best_position_distance));
// best_position_distance = (position_weight/best_position_distance);
      // compute scalar gradient
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
} else {
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
}
if (best_rotation_distance > 0) {
best_rotation_distance =
(r_w_alpha * rotation_weight *
sinhf(r_w_alpha * best_rotation_distance)) /
(best_rotation_distance * coshf(r_w_alpha * best_rotation_distance));
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
} else {
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
}
}
}
} // namespace Pose
} // namespace Curobo
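// Host entry point for the pose cost. Picks one of four kernel
// instantiations (plain vs. log2(cosh) metric, with or without writing the
// per-term distances) and launches one thread per (batch, horizon) element,
// capped at 128 threads per block, on the current CUDA stream.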
std::vector<torch::Tensor>
pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, const torch::Tensor vec_convergence,
const torch::Tensor run_weight,
const torch::Tensor run_vec_weight,
const torch::Tensor batch_pose_idx, // batch_size, 1
const int batch_size, const int horizon, const int mode,
const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = true, const bool use_metric = false) {
using namespace Curobo::Pose;
// we compute the warp threads based on number of boxes:
assert(batch_pose_idx.size(0) == batch_size);
// TODO: verify this math
// const int batch_size = out_distance.size(0);
assert(run_weight.size(-1) == horizon);
const int bh = batch_size * horizon;
int threadsPerBlock = bh;
if (bh > 128) {
threadsPerBlock = 128;
}
// we fit warp thread spheres in a threadsPerBlock
int blocksPerGrid = (bh + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (use_metric) {
if (write_distance)
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
goalset_pose_distance_metric_kernel
<scalar_t, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
goalset_pose_distance_metric_kernel
// goalset_pose_distance_kernel
<scalar_t, false><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
} else {
if(write_distance)
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
// goalset_pose_distance_metric_kernel
goalset_pose_distance_kernel<scalar_t, true>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
// goalset_pose_distance_metric_kernel
goalset_pose_distance_kernel<scalar_t, false>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_distance, out_position_distance, out_rotation_distance,
distance_p_vector, distance_q_vector, out_gidx};
}
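// Note on layouts as read by the kernels above: current_position is
// [batch, horizon, 3], current_quat is [batch, horizon, 4] stored as
// (qw, qx, qy, qz) and reordered internally, goal_position/goal_quat are
// indexed through batch_pose_idx (scaled by num_goals in the batch modes),
// and weight packs [rotation_weight, position_weight, r_alpha, p_alpha].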
std::vector<torch::Tensor>
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance,
const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance = false) {
// we compute the warp threads based on number of boxes:
// TODO: verify this math
// const int batch_size = grad_distance.size(0);
using namespace Curobo::Pose;
int threadsPerBlock = batch_size;
if (batch_size > 128) {
threadsPerBlock = 128;
}
// we fit warp thread spheres in a threadsPerBlock
int blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (use_distance) {
AT_DISPATCH_FLOATING_TYPES(
grad_distance.scalar_type(), "backward_pose_distance", ([&] {
backward_pose_distance_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_grad_p.data_ptr<scalar_t>(),
out_grad_q.data_ptr<scalar_t>(),
grad_distance.data_ptr<scalar_t>(),
grad_p_distance.data_ptr<scalar_t>(),
grad_q_distance.data_ptr<scalar_t>(),
pose_weight.data_ptr<scalar_t>(),
grad_p_vec.data_ptr<scalar_t>(),
grad_q_vec.data_ptr<scalar_t>(), batch_size);
}));
} else {
AT_DISPATCH_FLOATING_TYPES(
grad_distance.scalar_type(), "backward_pose", ([&] {
backward_pose_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_grad_p.data_ptr<scalar_t>(),
out_grad_q.data_ptr<scalar_t>(),
grad_distance.data_ptr<scalar_t>(),
pose_weight.data_ptr<scalar_t>(),
grad_p_vec.data_ptr<scalar_t>(),
grad_q_vec.data_ptr<scalar_t>(), batch_size);
}));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_grad_p, out_grad_q};
}

View File

@@ -0,0 +1,643 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "helper_math.h"
#include <cmath>
#include <cuda_fp16.h>
#include <vector>
#define SINGLE_GOAL 0
#define BATCH_GOAL 1
#define GOALSET 2
#define BATCH_GOALSET 3
namespace Curobo {
namespace Geometry {
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
if (var < 0)
return 0;
else
return var;
}
template <typename scalar_t>
__global__ void self_collision_distance_kernel(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 4
const scalar_t *robot_spheres, // batch x nspheres x 4
const scalar_t *collision_threshold, const int batch_size,
const int nspheres, const scalar_t *weight, const bool write_grad = false) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
float r_diff, distance;
float max_penetration = 0;
float3 sph1, sph2, dist_vec;
int sph1_idx = -1;
int sph2_idx = -1;
// iterate over spheres:
for (int i = 0; i < nspheres; i++) {
sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4];
for (int j = i + 1; j < nspheres; j++) {
r_diff = collision_threshold[i * nspheres + j];
if (isinf(r_diff)) {
continue;
}
sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4];
// compute sphere distance:
distance = relu(r_diff - length(sph1 - sph2));
if (distance > max_penetration) {
max_penetration = distance;
sph1_idx = i;
sph2_idx = j;
if (write_grad) {
dist_vec = (sph1 - sph2) / distance;
}
}
}
}
// write out pose distance:
if (max_penetration > 0) {
out_distance[batch_idx] = weight[0] * max_penetration;
if (write_grad) {
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] =
weight[0] * dist_vec;
}
}
}
typedef struct {
float d;
int16_t i;
int16_t j;
} dist_t;
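// dist_t is exactly 8 bytes (float + two int16), so a whole entry can be
// exchanged as a single uint64_t via __shfl_down_sync in the warp
// reductions below.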
///////////////////////////////////////////////////////////
// n warps per row
// ndpt rows per warp
///////////////////////////////////////////////////////////
template <typename scalar_t>
__global__ void self_collision_distance_kernel4(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 3
const scalar_t *robot_spheres, // batch x nspheres x 3
const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size,
const int nspheres,
const int ndpt, // number of distances to be computed per thread
const int nwpr, // number of warps per row
const scalar_t *weight, uint8_t *sparse_index,
const bool write_grad = false) {
int batch_idx = blockIdx.x;
int warp_idx = threadIdx.x / 32;
int i = ndpt * (warp_idx / nwpr); // starting row number for this warp
int j = (warp_idx % nwpr) * 32; // starting column number for this warp
dist_t max_d = {.d = 0.0, .i = 0, .j = 0};
__shared__ dist_t max_darr[32];
// Optimization: About 1/3 of the warps will have no work.
// We compute distances only when i<j. If i is never <j in this warp, we have
// no work
if (i > j + 31) { // this warp has no work
max_darr[warp_idx] = max_d;
return;
}
// load robot_spheres to shared memory
extern __shared__ float4 __rs_shared[];
if (threadIdx.x < nspheres) {
float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)];
//float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)],
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1],
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2],
//robot_spheres_radius[threadIdx.x]) ;
sph.w += offsets[threadIdx.x];
__rs_shared[threadIdx.x] = sph;
}
__syncthreads();
//////////////////////////////////////////////////////
// Compute distances and store the maximum per thread
// in registers (max_d).
  // Each thread computes up to ndpt distances.
// two warps per row
//////////////////////////////////////////////////////
// int nspheres_2 = nspheres * nspheres;
j = j + threadIdx.x % 32; // column number for this thread
float4 sph2;
if (j < nspheres) {
sph2 = __rs_shared[j]; // we need not load sph2 in every iteration.
#pragma unroll 16
for (int k = 0; k < ndpt; k++, i++) { // increment i also here
if (i < nspheres && j > i) {
// check if self collision is allowed here:
if (coll_matrix[i * nspheres + j] == 1) {
float4 sph1 = __rs_shared[i];
if (sph1.w <= 0.0 || sph2.w <= 0.0)
{
continue;
}
float r_diff = sph1.w + sph2.w;
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
(sph1.z - sph2.z) * (sph1.z - sph2.z));
//float distance = max((float)0.0, (float)(r_diff - d));
float distance = r_diff - d;
// printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j,
// distance);
if (distance > max_d.d) {
max_d.d = distance;
max_d.i = i;
max_d.j = j;
}
}
}
}
}
// max_d has the result max for this thread
//////////////////////////////////////////////////////
  // Reduce blockDim.x values using blockDim.x threads
//////////////////////////////////////////////////////
// Perform warp-wide reductions
// Optimization: Skip the reduction if all the values are zero
unsigned zero_mask = __ballot_sync(
0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So,
// zero_mask should be 0 in the common case.
if (zero_mask != 0) { // some of the values are non-zero
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
if (threadIdx.x < blockDim.x) {
// dist_t max_d = dist_sh[threadIdx.x];
for (int offset = 16; offset > 0; offset /= 2) {
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset);
dist_t d_temp = *(dist_t *)&nd;
if (d_temp.d > max_d.d) {
max_d = d_temp;
}
}
}
}
// thread0 in the warp has the max_d for the warp
if (threadIdx.x % 32 == 0) {
max_darr[warp_idx] = max_d;
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
// blockIdx.x, max_d);
}
if (threadIdx.x < nspheres) {
if (write_grad && sparse_index[batch_idx * nspheres + threadIdx.x] != 0) {
*(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] =
make_float4(0.0);
sparse_index[batch_idx * nspheres + threadIdx.x] = 0;
}
}
__syncthreads();
if (threadIdx.x == 0) {
dist_t max_d = max_darr[0];
    // TODO: This can be parallelized
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
if (max_darr[i].d > max_d.d) {
max_d = max_darr[i];
}
}
//////////////////////////////////////////////////////
// Write out the final results
//////////////////////////////////////////////////////
if (max_d.d != 0.0) {
out_distance[batch_idx] = weight[0] * max_d.d;
if (write_grad) {
float3 sph1 =
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)];
float3 sph2 =
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)];
float3 dist_vec = (sph1 - sph2) / max_d.d;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] =
weight[0] * dist_vec;
sparse_index[batch_idx * nspheres + max_d.i] = 1;
sparse_index[batch_idx * nspheres + max_d.j] = 1;
}
} else {
out_distance[batch_idx] = 0;
}
}
}
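// Variant used when the thread pair assignments are precomputed on the host:
// each thread reads ndpt (i, j) sphere-pair indices from locations_ (padded
// with -1), evaluates them for up to NBPB batch items per block, and the
// per-thread maxima are reduced with warp shuffles and then across warps by
// thread 0.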
template <typename scalar_t, int ndpt, int NBPB>
__global__ void self_collision_distance_kernel7(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 3
uint8_t *sparse_index,
const scalar_t *robot_spheres, // batch x nspheres x 3
const scalar_t *offsets, // nspheres
const scalar_t *weight, const int16_t *locations_, const int batch_size,
const int nspheres, const bool write_grad = false) {
uint32_t batch_idx = blockIdx.x * NBPB;
uint8_t nbpb = min(NBPB, batch_size - batch_idx);
if (nbpb == 0)
return;
// Layout in shared memory:
// sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=4]
// sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=4]
// ...
extern __shared__ float4 __rs_shared[];
if (threadIdx.x < nspheres) { // threadIdx.x is sphere index
#pragma unroll
for (int l = 0; l < nbpb; l++) {
float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)];
//float4 sph = make_float4(
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)],
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1],
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2],
// robot_spheres_radius[threadIdx.x]
//);
sph.w += offsets[threadIdx.x];
__rs_shared[NBPB * threadIdx.x + l] = sph;
}
}
__syncthreads();
//////////////////////////////////////////////////////
// Compute distances and store the maximum per thread
// in registers (max_d).
  // Each thread computes up to ndpt distances.
//////////////////////////////////////////////////////
dist_t max_d[NBPB] = {{.d = 0.0, .i = 0, .j = 0}};
int16_t indices[ndpt * 2];
for (uint8_t i =0; i< ndpt * 2; i++)
{
indices[i] = locations_[(threadIdx.x) * 2 * ndpt + i];
}
#pragma unroll
for (uint8_t k = 0; k < ndpt; k++) {
    // We are iterating through ndpt pairs of spheres across the batch;
    // a larger ndpt lets each thread cover more sphere pairs.
int i = indices[k * 2];
int j = indices[k * 2 + 1];
if (i == -1 || j == -1)
continue;
#pragma unroll
for (uint16_t l = 0; l < nbpb; l++) { // iterate through nbpb batches
float4 sph1 = __rs_shared[NBPB * i + l];
float4 sph2 = __rs_shared[NBPB * j + l];
if (sph1.w <= 0.0 || sph2.w <= 0.0)
{
continue;
}
float r_diff =
sph1.w + sph2.w; // sum of two radii, radii include respective offsets
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
(sph1.z - sph2.z) * (sph1.z - sph2.z));
float f_diff = r_diff - d;
if (f_diff > max_d[l].d) {
max_d[l].d = f_diff;
max_d[l].i = i;
max_d[l].j = j;
}
}
}
// max_d has the result max for this thread
//////////////////////////////////////////////////////
  // Reduce blockDim.x values using blockDim.x threads
//////////////////////////////////////////////////////
  // We reduce the per-thread maxima within each warp (32 lanes) using
  // shuffles; per-warp results go into max_darr (up to 32 warps per batch
  // item) and are combined by thread 0 below.
__shared__ dist_t max_darr[32 * NBPB];
#pragma unroll
for (uint8_t l = 0; l < nbpb; l++) {
// Perform warp-wide reductions
// Optimization: Skip the reduction if all the values are zero
unsigned zero_mask = __ballot_sync(
0xffffffff,
max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask
// should be 0 in the common case.
if (zero_mask != 0) { // some of the values are non-zero
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
if (threadIdx.x < blockDim.x) {
// dist_t max_d = dist_sh[threadIdx.x];
for (int offset = 16; offset > 0; offset /= 2) {
// the offset here is linked to ndpt?
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset);
dist_t d_temp = *(dist_t *)&nd;
if (d_temp.d > max_d[l].d) {
max_d[l] = d_temp;
}
}
}
}
// thread0 in the warp has the max_d for the warp
if (threadIdx.x % 32 == 0) {
max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l];
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
// blockIdx.x, max_d);
}
}
if (threadIdx.x < nspheres) {
for (int l = 0; l < nbpb; l++) {
if (write_grad &&
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0) {
*(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] =
make_float4(0.0);
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0;
}
}
}
__syncthreads();
if (threadIdx.x == 0) {
#pragma unroll
for (uint8_t l = 0; l < nbpb; l++) {
dist_t max_d = max_darr[l * 32];
      // TODO: This can be parallelized
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
if (max_darr[l * 32 + i].d > max_d.d) {
max_d = max_darr[l * 32 + i];
}
}
//////////////////////////////////////////////////////
// Write out the final results
//////////////////////////////////////////////////////
if (max_d.d != 0.0) {
out_distance[batch_idx + l] = weight[0] * max_d.d;
if (write_grad) {
float3 sph1 =
*(float3 *)&robot_spheres[4 *
((batch_idx + l) * nspheres + max_d.i)];
float3 sph2 =
*(float3 *)&robot_spheres[4 *
((batch_idx + l) * nspheres + max_d.j)];
float3 dist_vec = (sph1 - sph2) / max_d.d;
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] =
weight[0] * dist_vec;
sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1;
sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1;
}
} else {
out_distance[batch_idx + l] = 0;
}
}
}
}
} // namespace Geometry
} // namespace Curobo
// This is the best version so far.
// It precomputes the start addresses per thread on the cpu.
// The rest is similar to the version above.
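// Dispatches between the precomputed-pair kernel (kernel7, templated on
// ndpt with supported values 1, 2, 4, 8, 32, 64, 128 and 512) and the
// warp-per-row fallback (kernel4, one block per batch item with 32
// distances per thread).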
std::vector<torch::Tensor> self_collision_distance(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 3
const torch::Tensor collision_offset, // n_spheres x n_spheres
const torch::Tensor weight, const torch::Tensor collision_matrix,
const torch::Tensor thread_locations, const int thread_locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, // Does this need to match template?
const bool experimental_kernel = false) {
using namespace Curobo::Geometry;
// use efficient kernel based on number of threads:
const int nbpb = 1;
assert(nspheres < 1024);
int threadsPerBlock = ((thread_locations_size / 2) + ndpt - 1) /
ndpt; // location_size must be an even number. We store
// i,j for each sphere pair.
//assert(threadsPerBlock/nbpb <=32);
if (threadsPerBlock < 32*nbpb)
{
threadsPerBlock = 32 * nbpb;
}
if (threadsPerBlock < nspheres)
{
threadsPerBlock = nspheres;
}
int blocksPerGrid = (batch_size + nbpb - 1) / nbpb; // number of batches per block
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// ((threadsPerBlock >= nspheres))&&
if (experimental_kernel) {
int smemSize = nbpb * nspheres * sizeof(float4);
if (ndpt == 1)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 1, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 2)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 2, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 4)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 4, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 8)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 8, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 32)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 32, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 64)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 64, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 128)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 128, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 512)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 512, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else
{
      assert(false); // only ndpt values of 1, 2, 4, 8, 32, 64, 128 and 512 are supported.
}
}
else {
int ndpt_n = 32; // number of distances to be computed per thread
int nwpr = (nspheres + 31) / 32;
int warpsPerBlock = nwpr * ((nspheres + ndpt_n - 1) / ndpt_n);
threadsPerBlock = warpsPerBlock * 32;
blocksPerGrid = batch_size;
// printf("Blocks: %d, threads/block:%d, ndpt=%d, nwpr=%d\n", blocksPerGrid,
// threadsPerBlock, ndpt, nwpr);
int smemSize = nspheres * sizeof(float4);
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel4<scalar_t>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
collision_matrix.data_ptr<uint8_t>(), batch_size, nspheres,
ndpt_n, nwpr, weight.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(), compute_grad);
}));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_distance, out_vec, sparse_index};
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,257 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
std::vector<torch::Tensor> step_position_clique(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof);
std::vector<torch::Tensor> step_position_clique2(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof, const int mode);
std::vector<torch::Tensor> step_position_clique2_idx(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof, const int mode);
std::vector<torch::Tensor> backward_step_position_clique(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof);
std::vector<torch::Tensor> backward_step_position_clique2(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof, const int mode);
std::vector<torch::Tensor>
step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity,
const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size,
const int horizon, const int dof, const bool use_rk2 = true);
std::vector<torch::Tensor> step_acceleration_idx(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const bool use_rk2 = true);
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
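// Each wrapper below pins the CUDA device via OptionalCUDAGuard, validates
// that every tensor is a contiguous CUDA tensor, and then forwards to the
// corresponding kernel launcher declared above.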
std::vector<torch::Tensor> step_position_clique_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_position_clique(out_position, out_velocity, out_acceleration,
out_jerk, u_position, start_position,
start_velocity, start_acceleration, traj_dt,
batch_size, horizon, dof);
}
std::vector<torch::Tensor> step_position_clique2_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_position_clique2(out_position, out_velocity, out_acceleration,
out_jerk, u_position, start_position,
start_velocity, start_acceleration, traj_dt,
batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> step_position_clique2_idx_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
CHECK_INPUT(start_idx);
return step_position_clique2_idx(
out_position, out_velocity, out_acceleration, out_jerk, u_position,
start_position, start_velocity, start_acceleration, start_idx, traj_dt,
batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> backward_step_position_clique_wrapper(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof) {
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
CHECK_INPUT(out_grad_position);
CHECK_INPUT(grad_position);
CHECK_INPUT(grad_velocity);
CHECK_INPUT(grad_acceleration);
CHECK_INPUT(grad_jerk);
CHECK_INPUT(traj_dt);
return backward_step_position_clique(
out_grad_position, grad_position, grad_velocity, grad_acceleration,
grad_jerk, traj_dt, batch_size, horizon, dof);
}
std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
CHECK_INPUT(out_grad_position);
CHECK_INPUT(grad_position);
CHECK_INPUT(grad_velocity);
CHECK_INPUT(grad_acceleration);
CHECK_INPUT(grad_jerk);
CHECK_INPUT(traj_dt);
return backward_step_position_clique2(
out_grad_position, grad_position, grad_velocity, grad_acceleration,
grad_jerk, traj_dt, batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> step_acceleration_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof, const bool use_rk2 = true) {
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
CHECK_INPUT(u_acc);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_acceleration(out_position, out_velocity, out_acceleration,
out_jerk, u_acc, start_position, start_velocity,
start_acceleration, traj_dt, batch_size, horizon,
dof, use_rk2);
}
std::vector<torch::Tensor> step_acceleration_idx_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const bool use_rk2 = true) {
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
CHECK_INPUT(u_acc);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(start_idx);
CHECK_INPUT(traj_dt);
return step_acceleration_idx(out_position, out_velocity, out_acceleration,
out_jerk, u_acc, start_position, start_velocity,
start_acceleration, start_idx, traj_dt,
batch_size, horizon, dof, use_rk2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("step_position", &step_position_clique_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position2", &step_position_clique2_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_idx_position2", &step_position_clique2_idx_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position_backward", &backward_step_position_clique_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position_backward2", &backward_step_position_clique2_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_acceleration", &step_acceleration_wrapper,
"Tensor Step Acceleration (curobolib)");
m.def("step_acceleration_idx", &step_acceleration_idx_wrapper,
"Tensor Step Acceleration (curobolib)");
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cub/cub.cuh>
#include <math.h>
namespace Curobo {
namespace Optimization {
// We launch with d_opt*cost_s1 threads.
// We assume that cost_s2 is always 1.
template <typename scalar_t>
__global__ void update_best_kernel(scalar_t *best_cost, // 200x1
scalar_t *best_q, // 200x7
int16_t *best_iteration, // 200 x 1
int16_t *current_iteration, // 1
const scalar_t *cost, // 200x1
const scalar_t *q, // 200x7
const int d_opt, // 7
const int cost_s1, // 200
const int cost_s2,
const int iteration,
const float delta_threshold,
const float relative_threshold) // 1
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int size = cost_s1 * d_opt; // size of best_q
if (tid >= size) {
return;
}
const int cost_idx = tid / d_opt;
const float cost_new = cost[cost_idx];
const float best_cost_in = best_cost[cost_idx];
const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
if (change) {
best_q[tid] = q[tid]; // update best_q
if (tid % d_opt == 0) {
      best_cost[cost_idx] = cost_new; // update best_cost
      // best_iteration[cost_idx] = curr_iter + iteration;
      // best_iteration keeps track of whether the cost reduced by at least
      // delta_threshold; `iteration` here is the last_best parameter.
}
}
if (tid % d_opt == 0)
{
if (change)
{
best_iteration[cost_idx] = 0;
}
else
{
best_iteration[cost_idx] -= 1;
}
}
  // if (tid == 0)
//{
// curr_iter += 1;
// current_iteration[0] = curr_iter;
//}
}
} // namespace Optimization
} // namespace Curobo
std::vector<torch::Tensor>
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999) {
using namespace Curobo::Optimization;
const int threadsPerBlock = 128;
const int cost_size = cost_s1 * d_opt;
assert(cost_s2 == 1); // assumption
const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
// printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
// blocksPerGrid);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
cost.scalar_type(), "update_best_cu", ([&] {
update_best_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
best_iteration.data_ptr<int16_t>(),
current_iteration.data_ptr<int16_t>(),
cost.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
delta_threshold, relative_threshold);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {best_cost, best_q, best_iteration};
}
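For readability, the acceptance rule implemented by update_best_kernel can be restated with plain PyTorch ops. The sketch below is documentation only, not part of the extension; the shapes follow the kernel comments (costs are cost_s1 x 1, solutions are cost_s1 x d_opt) and everything else is an assumption.

# Hedged PyTorch mirror of the acceptance rule above (for documentation only).
# Assumed shapes: cost, best_cost -> (n, 1); q, best_q -> (n, d_opt); best_iteration -> (n, 1).
import torch

def update_best_reference(best_cost, best_q, best_iteration, cost, q,
                          delta_threshold=1e-5, relative_threshold=0.999):
    # Accept a candidate only if it improves on the stored best by more than
    # delta_threshold AND is below relative_threshold * best (both tests, as in the kernel).
    change = ((best_cost - cost) > delta_threshold) & (cost < best_cost * relative_threshold)
    best_q = torch.where(change, q, best_q)          # broadcasts over d_opt
    best_cost = torch.where(change, cost, best_cost)
    # best_iteration counts iterations since the last accepted improvement.
    best_iteration = torch.where(change, torch.zeros_like(best_iteration), best_iteration - 1)
    return best_cost, best_q, best_iteration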

View File

@@ -0,0 +1,405 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import geom_cu
except ImportError:
log_warn("geom_cu binary not found, jit compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
    from curobo.curobolib.util_file import add_cpp_path
geom_cu = load(
name="geom_cu",
sources=add_cpp_path(
[
"geom_cuda.cpp",
"sphere_obb_kernel.cu",
"pose_distance_kernel.cu",
"self_collision_kernel.cu",
]
),
)
def get_self_collision_distance(
out_distance,
out_vec,
sparse_index,
robot_spheres,
collision_offset,
weight,
coll_matrix,
thread_locations,
thread_size,
b_size,
nspheres,
compute_grad,
checks_per_thread=32,
experimental_kernel=True,
):
r = geom_cu.self_collision_distance(
out_distance,
out_vec,
sparse_index,
robot_spheres,
collision_offset,
weight,
coll_matrix,
thread_locations,
thread_size,
b_size,
nspheres,
compute_grad,
checks_per_thread,
experimental_kernel,
)
out_distance = r[0]
out_vec = r[1]
return out_distance, out_vec
class SelfCollisionDistance(torch.autograd.Function):
@staticmethod
def forward(
ctx,
out_distance,
out_vec,
sparse_idx,
robot_spheres,
sphere_offset,
weight,
coll_matrix,
thread_locations,
max_thread,
checks_per_thread: int,
experimental_kernel: bool,
return_loss: bool = False,
):
# get batch size
b, h, n_spheres, _ = robot_spheres.shape
out_distance, out_vec = get_self_collision_distance(
out_distance,
out_vec,
sparse_idx,
robot_spheres, # .view(-1, 4),
sphere_offset,
weight,
coll_matrix,
thread_locations,
max_thread,
b * h,
n_spheres,
robot_spheres.requires_grad,
checks_per_thread,
experimental_kernel,
)
ctx.return_loss = return_loss
ctx.save_for_backward(out_vec)
return out_distance
@staticmethod
def backward(ctx, grad_out_distance):
sphere_grad = None
if ctx.needs_input_grad[3]:
(g_vec,) = ctx.saved_tensors
if ctx.return_loss:
g_vec = g_vec * grad_out_distance.unsqueeze(1)
sphere_grad = g_vec
return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
class SelfCollisionDistanceLoss(SelfCollisionDistance):
@staticmethod
def backward(ctx, grad_out_distance):
sphere_grad = None
if ctx.needs_input_grad[3]:
(g_vec,) = ctx.saved_tensors
sphere_grad = g_vec * grad_out_distance.unsqueeze(1)
        return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
def get_pose_distance(
out_distance,
out_position_distance,
out_rotation_distance,
out_p_vec,
out_q_vec,
out_idx,
current_position,
goal_position,
current_quat,
goal_quat,
vec_weight,
weight,
vec_convergence,
run_weight,
run_vec_weight,
batch_pose_idx,
batch_size,
horizon,
mode=1,
num_goals=1,
write_grad=False,
write_distance=False,
use_metric=False,
):
if batch_pose_idx.shape[0] != batch_size:
raise ValueError("Index buffer size is different from batch size")
r = geom_cu.pose_distance(
out_distance,
out_position_distance,
out_rotation_distance,
out_p_vec,
out_q_vec,
out_idx,
current_position,
goal_position.view(-1),
current_quat,
goal_quat.view(-1),
vec_weight,
weight,
vec_convergence,
run_weight,
run_vec_weight,
batch_pose_idx,
batch_size,
horizon,
mode,
num_goals,
write_grad,
write_distance,
use_metric,
)
out_distance = r[0]
out_position_distance = r[1]
out_rotation_distance = r[2]
out_p_vec = r[3]
out_q_vec = r[4]
out_idx = r[5]
return out_distance, out_position_distance, out_rotation_distance, out_p_vec, out_q_vec, out_idx
def get_pose_distance_backward(
out_grad_p,
out_grad_q,
grad_distance,
grad_p_distance,
grad_q_distance,
pose_weight,
grad_p_vec,
grad_q_vec,
batch_size,
use_distance=False,
):
r = geom_cu.pose_distance_backward(
out_grad_p,
out_grad_q,
grad_distance,
grad_p_distance,
grad_q_distance,
pose_weight,
grad_p_vec,
grad_q_vec,
batch_size,
use_distance,
)
return r[0], r[1]
class SdfSphereOBB(torch.autograd.Function):
@staticmethod
def forward(
ctx,
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
transform_back,
compute_distance,
use_batch_env,
return_loss: bool = False,
):
r = geom_cu.closest_point(
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
transform_back,
compute_distance,
use_batch_env,
)
# r[1][r[1]!=r[1]] = 0.0
ctx.return_loss = return_loss
ctx.save_for_backward(r[1])
return r[0]
@staticmethod
def backward(ctx, grad_output):
grad_pt = None
if ctx.needs_input_grad[0]:
(r,) = ctx.saved_tensors
if ctx.return_loss:
r = r * grad_output.unsqueeze(-1)
grad_pt = r
return (
grad_pt,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
class SdfSweptSphereOBB(torch.autograd.Function):
@staticmethod
def forward(
ctx,
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
speed_dt,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
sweep_steps,
enable_speed_metric,
transform_back,
compute_distance,
use_batch_env,
return_loss: bool = False,
):
r = geom_cu.swept_closest_point(
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
speed_dt,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
sweep_steps,
enable_speed_metric,
transform_back,
compute_distance,
use_batch_env,
)
ctx.return_loss = return_loss
ctx.save_for_backward(
r[1],
)
return r[0]
@staticmethod
def backward(ctx, grad_output):
grad_pt = None
if ctx.needs_input_grad[0]:
(r,) = ctx.saved_tensors
if ctx.return_loss:
r = r * grad_output.unsqueeze(-1)
grad_pt = r
return (
grad_pt,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
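The autograd Functions in this file follow the same buffer-passing pattern used throughout curobolib: outputs are pre-allocated by the caller, filled in place by the CUDA kernel, saved as needed for backward, and backward returns None for every non-differentiable argument. Below is a minimal, self-contained sketch of that pattern using a toy in-place op (not a cuRobo kernel) purely for illustration.

# Toy illustration of the buffer-passing autograd pattern; `ScaleInto` stands in
# for a CUDA kernel that writes into a caller-owned output buffer.
import torch
from torch.autograd import Function

class ScaleInto(Function):
    @staticmethod
    def forward(ctx, out_buffer, x, scale):
        out_buffer.copy_(x * scale)   # kernel writes into the pre-allocated buffer
        ctx.mark_dirty(out_buffer)    # declare the in-place write to autograd
        ctx.scale = scale
        return out_buffer
    @staticmethod
    def backward(ctx, grad_out):
        # One gradient slot per forward input: None for the buffer and the Python scalar.
        return None, grad_out * ctx.scale, None

x = torch.randn(4, requires_grad=True)
out = torch.empty(4)
y = ScaleInto.apply(out, x, 2.0)
y.sum().backward()   # x.grad is filled with 2.0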

View File

@@ -0,0 +1,498 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
from torch.autograd import Function
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import kinematics_fused_cu
except ImportError:
log_warn("kinematics_fused_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
kinematics_fused_cu = load(
name="kinematics_fused_cu",
sources=add_cpp_path(
[
"kinematics_fused_cuda.cpp",
"kinematics_fused_kernel.cu",
]
),
)
def rotation_matrix_to_quaternion(in_mat, out_quat):
r = kinematics_fused_cu.matrix_to_quaternion(out_quat, in_mat.reshape(-1, 9))
return r[0]
class KinematicsFusedFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
False,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
sparsity_opt=True,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
b_size,
n_spheres,
sparsity_opt,
False,
)
out_q = r[0].view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
        b_robot_spheres: torch.Tensor,
        global_cumul_mat: torch.Tensor,
        joint_seq: torch.Tensor,
        fixed_transform: torch.Tensor,
        robot_spheres: torch.Tensor,
        link_map: torch.Tensor,
joint_map: torch.Tensor,
joint_map_type: torch.Tensor,
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
grad_out: torch.Tensor,
):
link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
)
ctx.save_for_backward(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
grad_joint = None
if ctx.needs_input_grad[4]:
(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
grad_out_link_quat,
grad_out_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
True,
)
return (
None,
None,
None,
None,
grad_joint,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
class KinematicsFusedGlobalCumulFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
# print(n_spheres)
# print(angle)
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
True,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
sparsity_opt=True,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
# b_size = link_mat_out.shape[0]
n_spheres = robot_sphere_out.shape[1]
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
# create backward tensors?
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
b_size,
n_spheres,
sparsity_opt,
True,
)
out_q = r[0] # .view(angle.shape)
out_q = out_q.view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
        b_robot_spheres: torch.Tensor,
        global_cumul_mat: torch.Tensor,
        joint_seq: torch.Tensor,
        fixed_transform: torch.Tensor,
        robot_spheres: torch.Tensor,
        link_map: torch.Tensor,
joint_map: torch.Tensor,
joint_map_type: torch.Tensor,
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
grad_out: torch.Tensor,
):
if joint_seq.is_contiguous():
joint_seq = joint_seq.view(-1)
else:
joint_seq = joint_seq.reshape(-1)
(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
) = KinematicsFusedGlobalCumulFunction._call_cuda(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
)
ctx.save_for_backward(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
grad_joint = None
if ctx.needs_input_grad[4]:
(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
grad_out_link_quat,
grad_out_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
True,
)
return (
None,
None,
None,
None,
grad_joint,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
def get_cuda_kinematics(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
use_global_cumul: bool = True,
):
if use_global_cumul:
link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
else:
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
return link_pos, link_quat, robot_spheres

112
src/curobo/curobolib/ls.py Normal file
View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import line_search_cu
except ImportError:
log_warn("line_search_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
line_search_cu = load(
name="line_search_cu",
sources=add_cpp_path(
[
"line_search_cuda.cpp",
"line_search_kernel.cu",
"update_best_kernel.cu",
]
),
)
def wolfe_line_search(
# m_idx,
best_x,
best_c,
best_grad,
g_x,
x_set,
sv,
c,
c_idx,
c_1: float,
c_2: float,
al,
sw: bool,
aw: bool,
):
batchsize = g_x.shape[0]
l1 = g_x.shape[1]
l2 = g_x.shape[2]
r = line_search_cu.line_search(
# m_idx,
best_x,
best_c,
best_grad,
g_x,
x_set,
sv,
c,
al,
c_idx,
c_1,
c_2,
sw,
aw,
l1,
l2,
batchsize,
)
# print("batchsize:" + str(batchsize))
return (r[0], r[1], r[2])
def update_best(
best_cost,
best_q,
best_iteration,
current_iteration,
cost,
q,
d_opt: int,
iteration: int,
delta_threshold: float = 1e-5,
relative_threshold: float = 0.999,
):
cost_s1 = cost.shape[0]
cost_s2 = cost.shape[1]
r = line_search_cu.update_best(
best_cost,
best_q,
best_iteration,
current_iteration,
cost,
q,
d_opt,
cost_s1,
cost_s2,
iteration,
delta_threshold,
relative_threshold,
)
# print("batchsize:" + str(batchsize))
return (r[0], r[1], r[2]) # output: best_cost, best_q, best_iteration
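A hedged usage sketch for update_best follows; the buffer shapes are assumptions inferred from the comments in update_best_kernel.cu (per-problem costs of shape (n, 1), solutions of shape (n, d_opt)), not an official example.

# Hedged sketch: allocate the tracking buffers and feed one optimizer iteration.
import torch

n_problems, d_opt = 200, 7
device = torch.device("cuda")

best_cost = torch.full((n_problems, 1), float("inf"), device=device)
best_q = torch.zeros((n_problems, d_opt), device=device)
best_iteration = torch.zeros((n_problems, 1), dtype=torch.int16, device=device)
current_iteration = torch.zeros((1,), dtype=torch.int16, device=device)

cost = torch.rand((n_problems, 1), device=device)   # stand-in for the optimizer's current costs
q = torch.rand((n_problems, d_opt), device=device)  # stand-in for the candidate solutions

best_cost, best_q, best_iteration = update_best(
    best_cost, best_q, best_iteration, current_iteration, cost, q, d_opt, iteration=0
)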

112
src/curobo/curobolib/opt.py Normal file
View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
from torch.autograd import Function
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import lbfgs_step_cu
except ImportError:
log_warn("lbfgs_step_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
lbfgs_step_cu = load(
name="lbfgs_step_cu",
sources=add_cpp_path(
[
"lbfgs_step_cuda.cpp",
"lbfgs_step_kernel.cu",
]
),
)
class LBFGScu(Function):
@staticmethod
def _call_cuda(
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=0.1,
stable_mode=False,
):
m, b, v_dim, _ = y_buffer.shape
R = lbfgs_step_cu.forward(
step_vec, # .view(-1),
rho_buffer, # .view(-1),
y_buffer, # .view(-1),
s_buffer, # .view(-1),
q,
grad_q, # .view(-1),
x_0,
grad_0,
epsilon,
b,
m,
v_dim,
stable_mode,
)
step_v = R[0].view(step_vec.shape)
return step_v
@staticmethod
def forward(
ctx,
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=0.1,
stable_mode=False,
):
R = LBFGScu._call_cuda(
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=epsilon,
stable_mode=stable_mode,
)
# ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
return R
@staticmethod
def backward(ctx, grad_output):
return (
None,
None,
None,
None,
None,
)

View File

@@ -0,0 +1,195 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import tensor_step_cu
except ImportError:
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
log_warn("tensor_step_cu not found, jit compiling...")
tensor_step_cu = load(
name="tensor_step_cu",
sources=add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"]),
)
def tensor_step_pos_clique_idx_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_idx_position2(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return (r[0], r[1], r[2], r[3])
def tensor_step_pos_clique_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_position2(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return (r[0], r[1], r[2], r[3])
def tensor_step_acc_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
use_rk2=True,
):
r = tensor_step_cu.step_acceleration(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
use_rk2,
)
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk
def tensor_step_acc_idx_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
use_rk2=True,
):
r = tensor_step_cu.step_acceleration_idx(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
use_rk2,
)
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk
def tensor_step_pos_clique_bwd(
out_grad_position,
grad_position,
grad_velocity,
grad_acceleration,
grad_jerk,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_position_backward2(
out_grad_position,
grad_position,
grad_velocity,
grad_acceleration,
grad_jerk,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return r[0]
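A hedged end-to-end sketch of the position-clique stepping wrapper follows. The (batch, horizon, dof) buffer layouts and the traj_dt shape are assumptions for illustration only; the authoritative callers are cuRobo's rollout classes.

# NOTE: shapes below are assumptions; verify against the rollout code before relying on them.
import torch

batch_size, horizon, dof = 32, 30, 7
device, dtype = torch.device("cuda"), torch.float32

def buf():
    return torch.zeros(batch_size, horizon, dof, device=device, dtype=dtype)

out_position, out_velocity, out_acceleration, out_jerk = buf(), buf(), buf(), buf()
u_position = torch.randn(batch_size, horizon, dof, device=device, dtype=dtype)
start_position = torch.zeros(batch_size, dof, device=device, dtype=dtype)
start_velocity = torch.zeros(batch_size, dof, device=device, dtype=dtype)
start_acceleration = torch.zeros(batch_size, dof, device=device, dtype=dtype)
traj_dt = torch.full((horizon,), 0.02, device=device, dtype=dtype)

pos, vel, acc, jerk = tensor_step_pos_clique_fwd(
    out_position, out_velocity, out_acceleration, out_jerk,
    u_position, start_position, start_velocity, start_acceleration,
    traj_dt, batch_size, horizon, dof,
)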

View File

@@ -0,0 +1,34 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Standard Library
import os
def get_cpp_path():
path = os.path.dirname(__file__)
return os.path.join(path, "cpp")
def join_path(path1, path2):
if isinstance(path2, str):
return os.path.join(path1, path2)
else:
return path2
def add_cpp_path(sources):
cpp_path = get_cpp_path()
new_list = []
for s in sources:
s = join_path(cpp_path, s)
new_list.append(s)
return new_list
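For example, the JIT fallbacks in the modules above pass kernel source names through add_cpp_path, which resolves them against the packaged cpp/ directory:

sources = add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"])
# -> [".../curobo/curobolib/cpp/tensor_step_cuda.cpp",
#     ".../curobo/curobolib/cpp/tensor_step_kernel.cu"]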