kinematics refactor with mimic joint support

Balakumar Sundaralingam
2024-04-03 18:42:01 -07:00
parent b481ee201a
commit 774dcfd609
60 changed files with 2177 additions and 810 deletions

View File

@@ -0,0 +1,21 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_FP8 defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
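// Evaluate with "#if CHECK_FP8" to gate fp8-specific code paths
// (used below in cuda_precisions.h and the geometry kernels).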
#define CHECK_INPUT_GUARD(x) CHECK_INPUT(x); const at::cuda::OptionalCUDAGuard guard(x.device())
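These macros centralize the input validation that was previously duplicated in every extension source file; CHECK_INPUT_GUARD additionally makes the tensor's device current for the rest of the enclosing scope. A minimal usage sketch, assuming a hypothetical wrapper (the function and tensor names are illustrative, not part of this commit):

#include <torch/extension.h>
#include "check_cuda.h"

torch::Tensor example_wrapper(torch::Tensor out, const torch::Tensor in)
{
  // Validate `in`, then pin the current CUDA device to in.device()
  // for every kernel launch in this scope.
  CHECK_INPUT_GUARD(in);
  CHECK_INPUT(out);
  // ... launch kernels here ...
  return out;
}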

View File

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include "check_cuda.h"
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#if CHECK_FP8
#include <cuda_fp8.h>
#endif

View File

@@ -13,7 +13,7 @@
#include <c10/cuda/CUDAGuard.h>
#include <vector>
#include "check_cuda.h"
// CUDA forward declarations
std::vector<torch::Tensor>self_collision_distance(
@@ -168,13 +168,6 @@ backward_pose_distance(torch::Tensor out_grad_p,
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>self_collision_distance_wrapper(
torch::Tensor out_distance, torch::Tensor out_vec,

View File

@@ -13,6 +13,8 @@
#include <c10/cuda/CUDAGuard.h>
#include <vector>
#include "check_cuda.h"
// CUDA forward declarations
std::vector<torch::Tensor>
@@ -33,7 +35,9 @@ std::vector<torch::Tensor>kin_fused_forward(
const torch::Tensor joint_map_type,
const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map,
const torch::Tensor joint_offset_map,
const int batch_size,
const int n_joints,
const int n_spheres,
const bool use_global_cumul = false);
@@ -52,76 +56,20 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map,
const torch::Tensor link_chain_map,
const torch::Tensor joint_offset_map,
const int batch_size,
const int n_joints,
const int n_spheres,
const bool sparsity_opt = true,
const bool use_global_cumul = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>kin_forward_wrapper(
torch::Tensor link_pos, torch::Tensor link_quat,
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const bool use_global_cumul = false)
{
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
// TODO: add check input
return kin_fused_forward(
link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
}
std::vector<torch::Tensor>kin_backward_wrapper(
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false)
{
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
return kin_fused_backward_16t(
grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
joint_map, joint_map_type, store_link_map, link_sphere_map,
link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
}
std::vector<torch::Tensor>
matrix_to_quaternion_wrapper(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
)
{
const at::cuda::OptionalCUDAGuard guard(in_rot.device());
CHECK_INPUT(in_rot);
CHECK_INPUT(out_quat);
return matrix_to_quaternion(out_quat, in_rot);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
m.def("forward", &kin_fused_forward, "Kinematics fused forward (CUDA)");
m.def("backward", &kin_fused_backward_16t, "Kinematics fused backward (CUDA)");
m.def("matrix_to_quaternion", &matrix_to_quaternion,
"Rotation Matrix to Quaternion (CUDA)");
}

View File

@@ -10,12 +10,13 @@
*/
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "helper_math.h"
#include <cuda_fp16.h>
#include "check_cuda.h"
#include <vector>
#define M 4
@@ -28,13 +29,6 @@
#define Y_ROT 4
#define Z_ROT 5
#define X_PRISM_NEG 6
#define Y_PRISM_NEG 7
#define Z_PRISM_NEG 8
#define X_ROT_NEG 9
#define Y_ROT_NEG 10
#define Z_ROT_NEG 11
#define MAX_BATCH_PER_BLOCK 32 // tunable parameter for improving occupancy
#define MAX_BW_BATCH_PER_BLOCK 16 // tunable parameter for improving occupancy
@@ -46,6 +40,8 @@ namespace Curobo
{
namespace Kinematics
{
template<typename psum_t>
__device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
float3 scale, psum_t& sum_out)
@@ -293,36 +289,11 @@ namespace Curobo
}
}
__device__ __forceinline__ void update_joint_type_direction(int& j_type, int8_t& axis_sign)
{
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
// Don't do anything if j_type < 6. j_type range is [0, 11]
// divergence here.
axis_sign = 1;
if (j_type >= 6)
{
j_type -= 6;
axis_sign = -1;
}
}
__device__ __forceinline__ void update_joint_type_direction(int& j_type)
{
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
// Don't do anything if j_type < 6. j_type range is [0, 11]
// divergence here.
if (j_type >= 6)
{
j_type -= 6;
}
}
__device__ __forceinline__ void update_axis_direction(
float& angle,
int & j_type)
int & j_type,
const float2 &j_offset)
{
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
// sign should be +ve <= 5 and -ve >5
@@ -330,9 +301,7 @@ namespace Curobo
// cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 =
// +ve,
// then this code should be j_type - 5.
angle = -1 * copysignf(1.0, j_type - 6) * angle;
update_joint_type_direction(j_type);
angle = j_offset.x * angle + j_offset.y;
}
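This is the core of the mimic-joint support: instead of encoding a flipped axis as j_type >= 6 and recovering a sign, every link now carries a (scale, offset) pair, and the actuated value is mapped through scale * q + offset. A regular joint uses (1, 0), a flipped axis (-1, 0), and a mimic joint the multiplier and offset relating it to the joint it mimics. A host-side sketch of the same mapping, with an illustrative function name not present in this commit:

// jointOffset packs one (scale, offset) pair per link: [s0, o0, s1, o1, ...].
inline float apply_joint_offset(const float *jointOffset, int link, float q)
{
  const float scale  = jointOffset[2 * link + 0];
  const float offset = jointOffset[2 * link + 1];
  return scale * q + offset; // mirrors update_axis_direction() above
}

Folding the sign into the scale also removes the j_type >= 6 branch (and its warp divergence) from both kernels, replacing it with a single fused multiply-add.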
// In the following versions of rot_fn, some non-nan values may become nan as we
@@ -462,7 +431,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
rot_backward_translation(const float3& vec, float *cumul_mat, float *l_pos,
const float3& loc_grad, psum_t& grad_q, const int8_t axis_sign = 1)
const float3& loc_grad, psum_t& grad_q, const float axis_sign = 1)
{
float3 e_pos, j_pos;
@@ -481,7 +450,7 @@ namespace Curobo
rot_backward_rotation(const float3 vec,
const float3 grad_vec,
psum_t & grad_q,
const int8_t axis_sign = 1)
const float axis_sign = 1)
{
grad_q += axis_sign * dot(vec, grad_vec);
}
@@ -489,7 +458,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
prism_backward_translation(const float3 vec, const float3 grad_vec,
psum_t& grad_q, const int8_t axis_sign = 1)
psum_t& grad_q, const float axis_sign = 1)
{
grad_q += axis_sign * dot(vec, grad_vec);
}
@@ -497,7 +466,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
z_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
{
float3 vec =
make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]);
@@ -512,7 +481,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
x_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
{
float3 vec =
make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]);
@@ -527,7 +496,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
y_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
{
float3 vec =
make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]);
@@ -542,7 +511,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
xyz_prism_backward_translation(float *cumul_mat, float3& loc_grad,
psum_t& grad_q, int xyz, const int8_t axis_sign = 1)
psum_t& grad_q, int xyz, const float axis_sign = 1)
{
prism_backward_translation(
make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
@@ -553,7 +522,7 @@ namespace Curobo
__device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat,
float3 & loc_grad,
psum_t & grad_q,
const int8_t axis_sign = 1)
const float axis_sign = 1)
{
// get rotation vector:
prism_backward_translation(
@@ -564,7 +533,7 @@ namespace Curobo
__device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat,
float3 & loc_grad,
psum_t & grad_q,
const int8_t axis_sign = 1)
const float axis_sign = 1)
{
// get rotation vector:
prism_backward_translation(
@@ -575,16 +544,15 @@ namespace Curobo
__device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat,
float3 & loc_grad,
psum_t & grad_q,
const int8_t axis_sign = 1)
const float axis_sign = 1)
{
// get rotation vector:
prism_backward_translation(
make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign);
}
__device__ __forceinline__ void
xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
float& grad_q, int xyz, const int8_t axis_sign = 1)
float& grad_q, int xyz, const float axis_sign = 1)
{
// get rotation vector:
rot_backward_translation(
@@ -592,10 +560,11 @@ namespace Curobo
&cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign);
}
template<typename psum_t>
__device__ __forceinline__ void
x_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
psum_t& grad_q, const int8_t axis_sign = 1)
psum_t& grad_q, const float axis_sign = 1)
{
// get rotation vector:
rot_backward_translation(
@@ -606,7 +575,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
y_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
psum_t& grad_q, const int8_t axis_sign = 1)
psum_t& grad_q, const float axis_sign = 1)
{
// get rotation vector:
rot_backward_translation(
@@ -617,7 +586,7 @@ namespace Curobo
template<typename psum_t>
__device__ __forceinline__ void
z_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
psum_t& grad_q, const int8_t axis_sign = 1)
psum_t& grad_q, const float axis_sign = 1)
{
// get rotation vector:
rot_backward_translation(
@@ -629,18 +598,19 @@ namespace Curobo
// This one should be about 10% faster.
template<typename scalar_t, bool use_global_cumul>
__global__ void
kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M
scalar_t *link_quat, // batchSize x store_n_links x M x M
kin_fused_warp_kernel2(float *link_pos, // batchSize x store_n_links x M x M
float *link_quat, // batchSize x store_n_links x M x M
scalar_t *b_robot_spheres, // batchSize x nspheres x M
scalar_t *global_cumul_mat,
const scalar_t *q, // batchSize x njoints
const scalar_t *fixedTransform, // nlinks x M x M
const scalar_t *robot_spheres, // nspheres x M
float *global_cumul_mat, // batchSize x nlinks x M x M
const float *q, // batchSize x njoints
const float *fixedTransform, // nlinks x M x M
const float *robot_spheres, // nspheres x M
const int8_t *jointMapType, // nlinks
const int16_t *jointMap, // nlinks
const int16_t *linkMap, // nlinks
const int16_t *storeLinkMap, // store_n_links
const int16_t *linkSphereMap, // nspheres
const float *jointOffset, // nlinks
const int batchSize, const int nspheres,
const int nlinks, const int njoints,
const int store_n_links)
@@ -688,7 +658,8 @@ namespace Curobo
else
{
float angle = q[batch * njoints + jointMap[l]];
update_axis_direction(angle, j_type);
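// Load this link's (scale, offset) pair with a single 8-byte read; assumes
// jointOffset is packed as [scale, offset] per link and 8-byte aligned.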
float2 angle_offset = *(float2 *)&jointOffset[l*2];
update_axis_direction(angle, j_type, angle_offset);
if (j_type <= Z_PRISM)
{
@@ -794,20 +765,21 @@ namespace Curobo
template<typename scalar_t, typename psum_t, bool use_global_cumul,
bool enable_sparsity_opt, int16_t MAX_JOINTS, bool PARALLEL_WRITE>
__global__ void kin_fused_backward_kernel3(
scalar_t *grad_out_link_q, // batchSize * njoints
const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16
const scalar_t *grad_nlinks_quat,
float *grad_out_link_q, // batchSize * njoints
const float *grad_nlinks_pos, // batchSize * store_n_links * 16
const float *grad_nlinks_quat,
const scalar_t *grad_spheres, // batchSize * nspheres * 4
const scalar_t *global_cumul_mat,
const scalar_t *q, // batchSize * njoints
const scalar_t *fixedTransform, // nlinks * 16
const scalar_t *robotSpheres, // batchSize * nspheres * 4
const float *global_cumul_mat,
const float *q, // batchSize * njoints
const float *fixedTransform, // nlinks * 16
const float *robotSpheres, // batchSize * nspheres * 4
const int8_t *jointMapType, // nlinks
const int16_t *jointMap, // nlinks
const int16_t *linkMap, // nlinks
const int16_t *storeLinkMap, // store_n_links
const int16_t *linkSphereMap, // nspheres
const int16_t *linkChainMap, // nlinks*nlinks
const float *jointOffset, // nlinks*2
const int batchSize, const int nspheres, const int nlinks,
const int njoints, const int store_n_links)
{
@@ -819,8 +791,7 @@ namespace Curobo
if (batch >= batchSize)
return;
// Each thread computes one element of the cumul_mat.
// first 4 threads compute a row of the output;
const int elem_idx = threadIdx.x % 16;
const int col_idx = elem_idx % 4;
@@ -832,6 +803,7 @@ namespace Curobo
for (int l = 0; l < nlinks; l++)
{
int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
cumul_mat[outAddrStart + elem_idx] =
global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx];
}
@@ -857,8 +829,8 @@ namespace Curobo
else
{
float angle = q[batch * njoints + jointMap[l]];
update_axis_direction(angle, j_type);
float2 angle_offset = *(float2 *)&jointOffset[l*2];
update_axis_direction(angle, j_type, angle_offset);
if (j_type <= Z_PRISM)
{
@@ -949,15 +921,11 @@ namespace Curobo
{
continue;
}
int8_t axis_sign = 1;
float axis_sign = jointOffset[j*2];
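// The scale term doubles as the chain-rule factor for the gradient:
// d(scale * q + offset)/dq = scale, so it replaces the old axis_sign.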
int j_type = jointMapType[j];
if (j_type != FIXED)
{
update_joint_type_direction(j_type, axis_sign);
}
if (j_type == Z_ROT)
{
float result = 0.0;
@@ -1046,12 +1014,13 @@ namespace Curobo
// for (int16_t k = joints_per_thread; k >= 0; k--)
for (int16_t k = 0; k < joints_per_thread; k++)
{
// int16_t j = elem_idx * joints_per_thread + k;
int16_t j = elem_idx + k * 16;
int16_t j = elem_idx * joints_per_thread + k;
//int16_t j = elem_idx + k * 16;
// int16_t j = elem_idx + k * 16; // (threadidx.x % 16) + k * 16 (0 to 16)
// int16_t j = k * M + elem_idx;
if ((j > max_lmap) || (j < 0))
continue;
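// With the blocked mapping j = elem_idx * joints_per_thread + k, j grows
// monotonically with k, so the first out-of-range j ends this thread's loop.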
if ((j > max_lmap))
break;
// This can be spread across threads as they are not sequential?
if (linkChainMap[l_map * nlinks + j] == 0.0)
@@ -1061,12 +1030,8 @@ namespace Curobo
int16_t j_idx = jointMap[j];
int j_type = jointMapType[j];
int8_t axis_sign = 1;
float axis_sign = jointOffset[j*2];
if (j_type != FIXED)
{
update_joint_type_direction(j_type, axis_sign);
}
// get rotation vector:
if (j_type == Z_ROT)
@@ -1090,8 +1055,9 @@ namespace Curobo
g_position, g_orientation, psum_grad[j_idx], axis_sign);
}
}
}
}
__syncthreads();
if (PARALLEL_WRITE)
{
// accumulate the partial sums across the 16 threads
@@ -1114,8 +1080,8 @@ namespace Curobo
for (int16_t j = 0; j < joints_per_thread; j++)
{
// const int16_t j_idx = elem_idx * joints_per_thread + j;
const int16_t j_idx = elem_idx + j * 16;
const int16_t j_idx = elem_idx * joints_per_thread + j;
//const int16_t j_idx = elem_idx + j * 16;
if (j_idx >= njoints)
{
@@ -1151,7 +1117,7 @@ namespace Curobo
{
{
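// psum_grad accumulates in the wider psum_t type (double in this commit);
// narrow to float only at the final global-memory write.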
grad_out_link_q[batch * njoints + j] =
psum_grad[j]; // write the sum to memory
(float) psum_grad[j]; // write the sum to memory
}
}
}
@@ -1195,11 +1161,27 @@ std::vector<torch::Tensor>kin_fused_forward(
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const torch::Tensor joint_offset_map,
const int batch_size,
const int n_joints, const int n_spheres,
const bool use_global_cumul = false)
{
using namespace Curobo::Kinematics;
const int n_joints = joint_vec.size(0) / batch_size;
CHECK_INPUT_GUARD(joint_vec);
CHECK_INPUT(link_pos);
CHECK_INPUT(link_quat);
CHECK_INPUT(global_cumul_mat);
CHECK_INPUT(batch_robot_spheres);
CHECK_INPUT(fixed_transform);
CHECK_INPUT(robot_spheres);
CHECK_INPUT(link_map);
CHECK_INPUT(joint_map);
CHECK_INPUT(joint_map_type);
CHECK_INPUT(store_link_map);
CHECK_INPUT(link_sphere_map);
//CHECK_INPUT(link_chain_map);
CHECK_INPUT(joint_offset_map);
const int n_links = link_map.size(0);
const int store_n_links = link_pos.size(1);
assert(joint_map.dtype() == torch::kInt16);
@@ -1238,38 +1220,42 @@ std::vector<torch::Tensor>kin_fused_forward(
if (use_global_cumul)
{
AT_DISPATCH_FLOATING_TYPES(
link_pos.scalar_type(), "kin_fused_forward", ([&] {
batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
kin_fused_warp_kernel2<scalar_t, true>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
batch_robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_sphere_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
link_pos.scalar_type(), "kin_fused_forward", ([&] {
batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
kin_fused_warp_kernel2<scalar_t, false>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
batch_robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_sphere_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
@@ -1292,15 +1278,29 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const torch::Tensor joint_offset_map,
const int batch_size, const int n_joints, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false)
{
using namespace Curobo::Kinematics;
const int n_joints = joint_vec.size(0) / batch_size;
CHECK_INPUT_GUARD(joint_vec);
CHECK_INPUT(grad_out);
CHECK_INPUT(grad_nlinks_pos);
CHECK_INPUT(grad_nlinks_quat);
CHECK_INPUT(global_cumul_mat);
CHECK_INPUT(fixed_transform);
CHECK_INPUT(robot_spheres);
CHECK_INPUT(link_map);
CHECK_INPUT(joint_map);
CHECK_INPUT(joint_map_type);
CHECK_INPUT(store_link_map);
CHECK_INPUT(link_sphere_map);
CHECK_INPUT(link_chain_map);
CHECK_INPUT(joint_offset_map);
const int n_links = link_map.size(0);
const int store_n_links = store_link_map.size(0);
// assert(n_links < 128);
assert(n_joints < 128); // for larger num. of joints, change kernel3's
// MAX_JOINTS template value.
@@ -1341,72 +1341,77 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
assert(sparsity_opt);
if (use_global_cumul)
{
if (n_joints < 16)
{
AT_DISPATCH_FLOATING_TYPES(
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, float, true, true, 16, true>
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, double, true, true, 16, true>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
grad_out.data_ptr<scalar_t>(),
grad_nlinks_pos.data_ptr<scalar_t>(),
grad_nlinks_quat.data_ptr<scalar_t>(),
grad_out.data_ptr<float>(),
grad_nlinks_pos.data_ptr<float>(),
grad_nlinks_quat.data_ptr<float>(),
grad_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(),
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_chain_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
else if (n_joints < 64)
{
AT_DISPATCH_FLOATING_TYPES(
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, float, true, true, 64, true>
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, double, true, true, 64, true>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
grad_out.data_ptr<scalar_t>(),
grad_nlinks_pos.data_ptr<scalar_t>(),
grad_nlinks_quat.data_ptr<scalar_t>(),
grad_out.data_ptr<float>(),
grad_nlinks_pos.data_ptr<float>(),
grad_nlinks_quat.data_ptr<float>(),
grad_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(),
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_chain_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, float, true, true, 128, true>
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, double, true, true, 128, true>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
grad_out.data_ptr<scalar_t>(),
grad_nlinks_pos.data_ptr<scalar_t>(),
grad_nlinks_quat.data_ptr<scalar_t>(),
grad_out.data_ptr<float>(),
grad_nlinks_pos.data_ptr<float>(),
grad_nlinks_quat.data_ptr<float>(),
grad_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(),
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_chain_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
@@ -1417,22 +1422,25 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
{
//
AT_DISPATCH_FLOATING_TYPES(
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, float, false, true, 128, true>
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
kin_fused_backward_kernel3<scalar_t, double, false, true, 128, true>
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
grad_out.data_ptr<scalar_t>(),
grad_nlinks_pos.data_ptr<scalar_t>(),
grad_nlinks_quat.data_ptr<scalar_t>(),
grad_out.data_ptr<float>(),
grad_nlinks_pos.data_ptr<float>(),
grad_nlinks_quat.data_ptr<float>(),
grad_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<scalar_t>(),
joint_vec.data_ptr<scalar_t>(),
fixed_transform.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
global_cumul_mat.data_ptr<float>(),
joint_vec.data_ptr<float>(),
fixed_transform.data_ptr<float>(),
robot_spheres.data_ptr<float>(),
joint_map_type.data_ptr<int8_t>(),
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
joint_map.data_ptr<int16_t>(),
link_map.data_ptr<int16_t>(),
store_link_map.data_ptr<int16_t>(),
link_sphere_map.data_ptr<int16_t>(),
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
link_chain_map.data_ptr<int16_t>(),
joint_offset_map.data_ptr<float>(),
batch_size, n_spheres,
n_links, n_joints, store_n_links);
}));
}
@@ -1447,7 +1455,8 @@ matrix_to_quaternion(torch::Tensor out_quat,
)
{
using namespace Curobo::Kinematics;
CHECK_INPUT(out_quat);
CHECK_INPUT_GUARD(in_rot);
// we compute the warp threads based on number of boxes:
// TODO: verify this math

View File

@@ -807,6 +807,7 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
{
using namespace Curobo::Optimization;
// call first kernel:
//const bool use_experimental = true;
const bool use_fixed_m = true;

View File

@@ -15,23 +15,13 @@
#include <torch/extension.h>
#include "helper_math.h"
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <vector>
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
#include "check_cuda.h"
#include "cuda_precisions.h"
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
#include <cuda_fp8.h>
#endif
#define M 4
#define VOXEL_DEBUG true
#define VOXEL_UNOBSERVED_DISTANCE -1000.0
@@ -63,8 +53,7 @@ namespace Curobo
}
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
#if CHECK_FP8
__device__ __forceinline__ float
get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx)
{
@@ -704,16 +693,21 @@ float4 &sum_pt)
int3 &xyz_grid,
float &interpolated_distance,
int &voxel_idx)
{
// convert location to index: can use floor to cast to int.
// to account for negative values, add 0.5 * bounds.
const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);
const float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z);
// xyz_loc = make_int3(floorf((sphere + 0.5 * loc_grid) / loc_grid_params.w));
xyz_loc = make_int3(floorf((sphere) / loc_grid_params.w) + (0.5 * loc_grid/loc_grid_params.w));
xyz_grid = make_int3(floorf(loc_grid / loc_grid_params.w));
//xyz_loc = make_int3(floorf((sphere + 0.5 * loc_grid) / loc_grid_params.w));
const float inv_voxel_size = 1.0/loc_grid_params.w;
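// Multiplying by the precomputed reciprocal avoids a division per axis;
// make_int3() truncates toward zero, which matches floorf() for the
// non-negative, in-bounds locations expected here.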
//xyz_loc = make_int3(sphere * inv_voxel_size) + make_int3(0.5 * loc_grid * inv_voxel_size);
xyz_loc = make_int3((sphere + 0.5 * loc_grid) * inv_voxel_size);
//xyz_loc = make_int3(sphere / loc_grid_params.w) + make_int3(floorf(0.5 * loc_grid/loc_grid_params.w));
xyz_grid = make_int3((loc_grid * inv_voxel_size)) + 1;
// find next nearest voxel to current point and then do weighted interpolation:
voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z;
@@ -732,8 +726,8 @@ float4 &sum_pt)
float3 delta = sphere - voxel_origin;
int3 next_loc = make_int3(floorf((make_float3(xyz_loc) + normalize(delta))));
float ratio = length(delta)/loc_grid_params.w;
int3 next_loc = make_int3(((make_float3(xyz_loc) + normalize(delta))));
float ratio = length(delta) * inv_voxel_size;
int next_voxel_idx = next_loc.x * xyz_grid.y * xyz_grid.z + next_loc.y * xyz_grid.z + next_loc.z;
@@ -2752,7 +2746,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3
else
{
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
#if CHECK_FP8
const auto fp8_type = torch::kFloat8_e4m3fn;
#else
const auto fp8_type = torch::kHalf;
@@ -3217,7 +3211,7 @@ sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
#if CHECK_FP8
const auto fp8_type = torch::kFloat8_e4m3fn;
#else
const auto fp8_type = torch::kHalf;
@@ -3352,7 +3346,7 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
#if CHECK_FP8
const auto fp8_type = torch::kFloat8_e4m3fn;
#else
const auto fp8_type = torch::kHalf;

View File

@@ -43,100 +43,10 @@ def rotation_matrix_to_quaternion(in_mat, out_quat):
class KinematicsFusedFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
False,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
sparsity_opt=True,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
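# view() requires contiguous memory; reshape() makes a copy when it is not.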
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
b_size,
n_spheres,
sparsity_opt,
False,
)
out_q = r[0].view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
b_robot_spheres: torch.tensor,
@@ -150,9 +60,17 @@ class KinematicsFusedFunction(Function):
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
joint_offset_map: torch.Tensor,
grad_out: torch.Tensor,
use_global_cumul: bool = True,
):
link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
ctx.use_global_cumul = use_global_cumul
b_shape = link_pos.shape
b_size = b_shape[0]
n_spheres = b_robot_spheres.shape[1]
n_joints = joint_seq.shape[-1]
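# n_joints is now passed to the CUDA op explicitly instead of being derived
# as joint_vec.size(0) / batch_size inside the C++ wrapper.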
r = kinematics_fused_cu.forward(
link_pos,
link_quat,
b_robot_spheres,
@@ -165,7 +83,17 @@ class KinematicsFusedFunction(Function):
joint_map_type,
store_link_map,
link_sphere_map,
joint_offset_map, # offset_joint_map
b_size,
n_joints,
n_spheres,
True,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
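# r holds (link_pos, link_quat, spheres, global_cumul_mat) computed by the op.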
ctx.save_for_backward(
joint_seq,
fixed_transform,
@@ -176,11 +104,11 @@ class KinematicsFusedFunction(Function):
store_link_map,
link_sphere_map,
link_chain_map,
joint_offset_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
return out_link_pos, out_link_quat, out_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
@@ -197,9 +125,11 @@ class KinematicsFusedFunction(Function):
store_link_map,
link_sphere_map,
link_chain_map,
joint_offset_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
@@ -215,7 +145,9 @@ class KinematicsFusedFunction(Function):
store_link_map,
link_sphere_map,
link_chain_map,
joint_offset_map,
True,
use_global_cumul=ctx.use_global_cumul,
)
return (
@@ -233,53 +165,10 @@ class KinematicsFusedFunction(Function):
None,
None,
None,
None,
None,
)
class KinematicsFusedGlobalCumulFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
# print(n_spheres)
# print(angle)
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
True,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
@@ -296,18 +185,18 @@ class KinematicsFusedGlobalCumulFunction(Function):
store_link_map,
link_sphere_map,
link_chain_map,
joint_offset_map,
sparsity_opt=True,
use_global_cumul=False,
):
b_shape = link_pos_out.shape
b_shape = grad_out.shape
b_size = b_shape[0]
# b_size = link_mat_out.shape[0]
n_spheres = robot_sphere_out.shape[1]
n_joints = angle.shape[-1]
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
# create backward tensors?
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
@@ -323,126 +212,17 @@ class KinematicsFusedGlobalCumulFunction(Function):
store_link_map,
link_sphere_map,
link_chain_map,
joint_offset_map, # offset_joint_map
b_size,
n_joints,
n_spheres,
sparsity_opt,
True,
use_global_cumul,
)
out_q = r[0] # .view(angle.shape)
out_q = out_q.view(b_size, -1)
out_q = r[0].view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
b_robot_spheres: torch.tensor,
global_cumul_mat: torch.Tensor,
joint_seq: torch.Tensor,
fixed_transform: torch.tensor,
robot_spheres: torch.tensor,
link_map: torch.tensor,
joint_map: torch.Tensor,
joint_map_type: torch.Tensor,
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
grad_out: torch.Tensor,
):
if joint_seq.is_contiguous():
joint_seq = joint_seq.view(-1)
else:
joint_seq = joint_seq.reshape(-1)
(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
) = KinematicsFusedGlobalCumulFunction._call_cuda(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
)
ctx.save_for_backward(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
grad_joint = None
if ctx.needs_input_grad[4]:
(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
grad_out_link_quat,
grad_out_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
True,
)
return (
None,
None,
None,
None,
grad_joint,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
def get_cuda_kinematics(
link_pos_seq,
@@ -458,41 +238,28 @@ def get_cuda_kinematics(
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
joint_offset_map,
grad_out_q,
use_global_cumul: bool = True,
):
if use_global_cumul:
link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
else:
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
# if not q_in.is_contiguous():
# q_in = q_in.contiguous()
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
joint_offset_map,
grad_out_q,
use_global_cumul,
)
return link_pos, link_quat, robot_spheres