kinematics refactor with mimic joint support
This commit is contained in:
21
src/curobo/curobolib/cpp/check_cuda.h
Normal file
21
src/curobo/curobolib/cpp/check_cuda.h
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
#include <torch/extension.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
|
||||
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
|
||||
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
|
||||
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
|
||||
|
||||
#define CHECK_FP8 defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
#define CHECK_INPUT_GUARD(x) CHECK_INPUT(x); const at::cuda::OptionalCUDAGuard guard(x.device())
|
||||
17
src/curobo/curobolib/cpp/cuda_precisions.h
Normal file
17
src/curobo/curobolib/cpp/cuda_precisions.h
Normal file
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
#include "check_cuda.h"
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
#if CHECK_FP8
|
||||
#include <cuda_fp8.h>
|
||||
#endif
|
||||
@@ -13,7 +13,7 @@
|
||||
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <vector>
|
||||
|
||||
#include "check_cuda.h"
|
||||
// CUDA forward declarations
|
||||
|
||||
std::vector<torch::Tensor>self_collision_distance(
|
||||
@@ -168,13 +168,6 @@ backward_pose_distance(torch::Tensor out_grad_p,
|
||||
|
||||
// C++ interface
|
||||
|
||||
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
|
||||
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) \
|
||||
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_CUDA(x); \
|
||||
CHECK_CONTIGUOUS(x)
|
||||
|
||||
std::vector<torch::Tensor>self_collision_distance_wrapper(
|
||||
torch::Tensor out_distance, torch::Tensor out_vec,
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <vector>
|
||||
|
||||
#include "check_cuda.h"
|
||||
|
||||
// CUDA forward declarations
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
@@ -33,7 +35,9 @@ std::vector<torch::Tensor>kin_fused_forward(
|
||||
const torch::Tensor joint_map_type,
|
||||
const torch::Tensor store_link_map,
|
||||
const torch::Tensor link_sphere_map,
|
||||
const torch::Tensor joint_offset_map,
|
||||
const int batch_size,
|
||||
const int n_joints,
|
||||
const int n_spheres,
|
||||
const bool use_global_cumul = false);
|
||||
|
||||
@@ -52,76 +56,20 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
|
||||
const torch::Tensor store_link_map,
|
||||
const torch::Tensor link_sphere_map,
|
||||
const torch::Tensor link_chain_map,
|
||||
const torch::Tensor joint_offset_map,
|
||||
const int batch_size,
|
||||
const int n_joints,
|
||||
const int n_spheres,
|
||||
const bool sparsity_opt = true,
|
||||
const bool use_global_cumul = false);
|
||||
|
||||
// C++ interface
|
||||
|
||||
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
|
||||
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) \
|
||||
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_CUDA(x); \
|
||||
CHECK_CONTIGUOUS(x)
|
||||
|
||||
std::vector<torch::Tensor>kin_forward_wrapper(
|
||||
torch::Tensor link_pos, torch::Tensor link_quat,
|
||||
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
|
||||
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
|
||||
const torch::Tensor robot_spheres, const torch::Tensor link_map,
|
||||
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
|
||||
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
|
||||
const int batch_size, const int n_spheres,
|
||||
const bool use_global_cumul = false)
|
||||
{
|
||||
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
|
||||
|
||||
// TODO: add check input
|
||||
return kin_fused_forward(
|
||||
link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
|
||||
fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
|
||||
store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor>kin_backward_wrapper(
|
||||
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
|
||||
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
|
||||
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
|
||||
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
|
||||
const torch::Tensor link_map, const torch::Tensor joint_map,
|
||||
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
|
||||
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
|
||||
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
|
||||
const bool use_global_cumul = false)
|
||||
{
|
||||
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
|
||||
|
||||
return kin_fused_backward_16t(
|
||||
grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
|
||||
global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
|
||||
joint_map, joint_map_type, store_link_map, link_sphere_map,
|
||||
link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor>
|
||||
matrix_to_quaternion_wrapper(torch::Tensor out_quat,
|
||||
const torch::Tensor in_rot // batch_size, 3
|
||||
)
|
||||
{
|
||||
const at::cuda::OptionalCUDAGuard guard(in_rot.device());
|
||||
|
||||
CHECK_INPUT(in_rot);
|
||||
CHECK_INPUT(out_quat);
|
||||
return matrix_to_quaternion(out_quat, in_rot);
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
|
||||
{
|
||||
m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
|
||||
m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
|
||||
m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
|
||||
m.def("forward", &kin_fused_forward, "Kinematics fused forward (CUDA)");
|
||||
m.def("backward", &kin_fused_backward_16t, "Kinematics fused backward (CUDA)");
|
||||
m.def("matrix_to_quaternion", &matrix_to_quaternion,
|
||||
"Rotation Matrix to Quaternion (CUDA)");
|
||||
}
|
||||
|
||||
@@ -10,12 +10,13 @@
|
||||
*/
|
||||
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "helper_math.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include "check_cuda.h"
|
||||
#include <vector>
|
||||
|
||||
#define M 4
|
||||
@@ -28,13 +29,6 @@
|
||||
#define Y_ROT 4
|
||||
#define Z_ROT 5
|
||||
|
||||
#define X_PRISM_NEG 6
|
||||
#define Y_PRISM_NEG 7
|
||||
#define Z_PRISM_NEG 8
|
||||
#define X_ROT_NEG 9
|
||||
#define Y_ROT_NEG 10
|
||||
#define Z_ROT_NEG 11
|
||||
|
||||
#define MAX_BATCH_PER_BLOCK 32 // tunable parameter for improving occupancy
|
||||
#define MAX_BW_BATCH_PER_BLOCK 16 // tunable parameter for improving occupancy
|
||||
|
||||
@@ -46,6 +40,8 @@ namespace Curobo
|
||||
{
|
||||
namespace Kinematics
|
||||
{
|
||||
|
||||
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
|
||||
float3 scale, psum_t& sum_out)
|
||||
@@ -293,36 +289,11 @@ namespace Curobo
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void update_joint_type_direction(int& j_type, int8_t& axis_sign)
|
||||
{
|
||||
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
|
||||
|
||||
// Don't do anything if j_type < 6. j_type range is [0, 11]
|
||||
// divergence here.
|
||||
axis_sign = 1;
|
||||
|
||||
if (j_type >= 6)
|
||||
{
|
||||
j_type -= 6;
|
||||
axis_sign = -1;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void update_joint_type_direction(int& j_type)
|
||||
{
|
||||
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
|
||||
|
||||
// Don't do anything if j_type < 6. j_type range is [0, 11]
|
||||
// divergence here.
|
||||
if (j_type >= 6)
|
||||
{
|
||||
j_type -= 6;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void update_axis_direction(
|
||||
float& angle,
|
||||
int & j_type)
|
||||
int & j_type,
|
||||
const float2 &j_offset)
|
||||
{
|
||||
// Assume that input j_type >= 0 . Check fixed joint outside of this function.
|
||||
// sign should be +ve <= 5 and -ve >5
|
||||
@@ -330,9 +301,7 @@ namespace Curobo
|
||||
// cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 =
|
||||
// +ve,
|
||||
// then this code should be j_type - 5.
|
||||
angle = -1 * copysignf(1.0, j_type - 6) * angle;
|
||||
|
||||
update_joint_type_direction(j_type);
|
||||
angle = j_offset.x * angle + j_offset.y;
|
||||
}
|
||||
|
||||
// In the following versions of rot_fn, some non-nan values may become nan as we
|
||||
@@ -462,7 +431,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
rot_backward_translation(const float3& vec, float *cumul_mat, float *l_pos,
|
||||
const float3& loc_grad, psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
const float3& loc_grad, psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
float3 e_pos, j_pos;
|
||||
|
||||
@@ -481,7 +450,7 @@ namespace Curobo
|
||||
rot_backward_rotation(const float3 vec,
|
||||
const float3 grad_vec,
|
||||
psum_t & grad_q,
|
||||
const int8_t axis_sign = 1)
|
||||
const float axis_sign = 1)
|
||||
{
|
||||
grad_q += axis_sign * dot(vec, grad_vec);
|
||||
}
|
||||
@@ -489,7 +458,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
prism_backward_translation(const float3 vec, const float3 grad_vec,
|
||||
psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
grad_q += axis_sign * dot(vec, grad_vec);
|
||||
}
|
||||
@@ -497,7 +466,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
z_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
float3 vec =
|
||||
make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]);
|
||||
@@ -512,7 +481,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
x_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
float3 vec =
|
||||
make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]);
|
||||
@@ -527,7 +496,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
y_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
float3 vec =
|
||||
make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]);
|
||||
@@ -542,7 +511,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
xyz_prism_backward_translation(float *cumul_mat, float3& loc_grad,
|
||||
psum_t& grad_q, int xyz, const int8_t axis_sign = 1)
|
||||
psum_t& grad_q, int xyz, const float axis_sign = 1)
|
||||
{
|
||||
prism_backward_translation(
|
||||
make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
|
||||
@@ -553,7 +522,7 @@ namespace Curobo
|
||||
__device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat,
|
||||
float3 & loc_grad,
|
||||
psum_t & grad_q,
|
||||
const int8_t axis_sign = 1)
|
||||
const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
prism_backward_translation(
|
||||
@@ -564,7 +533,7 @@ namespace Curobo
|
||||
__device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat,
|
||||
float3 & loc_grad,
|
||||
psum_t & grad_q,
|
||||
const int8_t axis_sign = 1)
|
||||
const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
prism_backward_translation(
|
||||
@@ -575,16 +544,15 @@ namespace Curobo
|
||||
__device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat,
|
||||
float3 & loc_grad,
|
||||
psum_t & grad_q,
|
||||
const int8_t axis_sign = 1)
|
||||
const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
prism_backward_translation(
|
||||
make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
|
||||
float& grad_q, int xyz, const int8_t axis_sign = 1)
|
||||
float& grad_q, int xyz, const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
rot_backward_translation(
|
||||
@@ -592,10 +560,11 @@ namespace Curobo
|
||||
&cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign);
|
||||
}
|
||||
|
||||
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
x_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
|
||||
psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
rot_backward_translation(
|
||||
@@ -606,7 +575,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
y_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
|
||||
psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
rot_backward_translation(
|
||||
@@ -617,7 +586,7 @@ namespace Curobo
|
||||
template<typename psum_t>
|
||||
__device__ __forceinline__ void
|
||||
z_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
|
||||
psum_t& grad_q, const int8_t axis_sign = 1)
|
||||
psum_t& grad_q, const float axis_sign = 1)
|
||||
{
|
||||
// get rotation vector:
|
||||
rot_backward_translation(
|
||||
@@ -629,18 +598,19 @@ namespace Curobo
|
||||
// This one should be about 10% faster.
|
||||
template<typename scalar_t, bool use_global_cumul>
|
||||
__global__ void
|
||||
kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M
|
||||
scalar_t *link_quat, // batchSize x store_n_links x M x M
|
||||
kin_fused_warp_kernel2(float *link_pos, // batchSize xz store_n_links x M x M
|
||||
float *link_quat, // batchSize x store_n_links x M x M
|
||||
scalar_t *b_robot_spheres, // batchSize x nspheres x M
|
||||
scalar_t *global_cumul_mat,
|
||||
const scalar_t *q, // batchSize x njoints
|
||||
const scalar_t *fixedTransform, // nlinks x M x M
|
||||
const scalar_t *robot_spheres, // nspheres x M
|
||||
float *global_cumul_mat, // batchSize x nlinks x M x M
|
||||
const float *q, // batchSize x njoints
|
||||
const float *fixedTransform, // nlinks x M x M
|
||||
const float *robot_spheres, // nspheres x M
|
||||
const int8_t *jointMapType, // nlinks
|
||||
const int16_t *jointMap, // nlinks
|
||||
const int16_t *linkMap, // nlinks
|
||||
const int16_t *storeLinkMap, // store_n_links
|
||||
const int16_t *linkSphereMap, // nspheres
|
||||
const float *jointOffset, // nlinks
|
||||
const int batchSize, const int nspheres,
|
||||
const int nlinks, const int njoints,
|
||||
const int store_n_links)
|
||||
@@ -688,7 +658,8 @@ namespace Curobo
|
||||
else
|
||||
{
|
||||
float angle = q[batch * njoints + jointMap[l]];
|
||||
update_axis_direction(angle, j_type);
|
||||
float2 angle_offset = *(float2 *)&jointOffset[l*2];
|
||||
update_axis_direction(angle, j_type, angle_offset);
|
||||
|
||||
if (j_type <= Z_PRISM)
|
||||
{
|
||||
@@ -794,20 +765,21 @@ namespace Curobo
|
||||
template<typename scalar_t, typename psum_t, bool use_global_cumul,
|
||||
bool enable_sparsity_opt, int16_t MAX_JOINTS, bool PARALLEL_WRITE>
|
||||
__global__ void kin_fused_backward_kernel3(
|
||||
scalar_t *grad_out_link_q, // batchSize * njoints
|
||||
const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16
|
||||
const scalar_t *grad_nlinks_quat,
|
||||
float *grad_out_link_q, // batchSize * njoints
|
||||
const float *grad_nlinks_pos, // batchSize * store_n_links * 16
|
||||
const float *grad_nlinks_quat,
|
||||
const scalar_t *grad_spheres, // batchSize * nspheres * 4
|
||||
const scalar_t *global_cumul_mat,
|
||||
const scalar_t *q, // batchSize * njoints
|
||||
const scalar_t *fixedTransform, // nlinks * 16
|
||||
const scalar_t *robotSpheres, // batchSize * nspheres * 4
|
||||
const float *global_cumul_mat,
|
||||
const float *q, // batchSize * njoints
|
||||
const float *fixedTransform, // nlinks * 16
|
||||
const float *robotSpheres, // batchSize * nspheres * 4
|
||||
const int8_t *jointMapType, // nlinks
|
||||
const int16_t *jointMap, // nlinks
|
||||
const int16_t *linkMap, // nlinks
|
||||
const int16_t *storeLinkMap, // store_n_links
|
||||
const int16_t *linkSphereMap, // nspheres
|
||||
const int16_t *linkChainMap, // nlinks*nlinks
|
||||
const float *jointOffset, // nlinks*2
|
||||
const int batchSize, const int nspheres, const int nlinks,
|
||||
const int njoints, const int store_n_links)
|
||||
{
|
||||
@@ -819,8 +791,7 @@ namespace Curobo
|
||||
|
||||
if (batch >= batchSize)
|
||||
return;
|
||||
|
||||
// Each thread computes one element of the cumul_mat.
|
||||
// Each thread computes one element of the cumul_mat.
|
||||
// first 4 threads compute a row of the output;
|
||||
const int elem_idx = threadIdx.x % 16;
|
||||
const int col_idx = elem_idx % 4;
|
||||
@@ -832,6 +803,7 @@ namespace Curobo
|
||||
for (int l = 0; l < nlinks; l++)
|
||||
{
|
||||
int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
|
||||
|
||||
cumul_mat[outAddrStart + elem_idx] =
|
||||
global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx];
|
||||
}
|
||||
@@ -857,8 +829,8 @@ namespace Curobo
|
||||
else
|
||||
{
|
||||
float angle = q[batch * njoints + jointMap[l]];
|
||||
|
||||
update_axis_direction(angle, j_type);
|
||||
float2 angle_offset = *(float2 *)&jointOffset[l*2];
|
||||
update_axis_direction(angle, j_type, angle_offset);
|
||||
|
||||
if (j_type <= Z_PRISM)
|
||||
{
|
||||
@@ -949,15 +921,11 @@ namespace Curobo
|
||||
{
|
||||
continue;
|
||||
}
|
||||
int8_t axis_sign = 1;
|
||||
float axis_sign = jointOffset[j*2];
|
||||
|
||||
int j_type = jointMapType[j];
|
||||
|
||||
if (j_type != FIXED)
|
||||
{
|
||||
update_joint_type_direction(j_type, axis_sign);
|
||||
}
|
||||
|
||||
|
||||
if (j_type == Z_ROT)
|
||||
{
|
||||
float result = 0.0;
|
||||
@@ -1046,12 +1014,13 @@ namespace Curobo
|
||||
// for (int16_t k = joints_per_thread; k >= 0; k--)
|
||||
for (int16_t k = 0; k < joints_per_thread; k++)
|
||||
{
|
||||
// int16_t j = elem_idx * joints_per_thread + k;
|
||||
int16_t j = elem_idx + k * 16;
|
||||
int16_t j = elem_idx * joints_per_thread + k;
|
||||
//int16_t j = elem_idx + k * 16;
|
||||
// int16_t j = elem_idx + k * 16; // (threadidx.x % 16) + k * 16 (0 to 16)
|
||||
|
||||
// int16_t j = k * M + elem_idx;
|
||||
if ((j > max_lmap) || (j < 0))
|
||||
continue;
|
||||
if ((j > max_lmap))
|
||||
break;
|
||||
|
||||
// This can be spread across threads as they are not sequential?
|
||||
if (linkChainMap[l_map * nlinks + j] == 0.0)
|
||||
@@ -1061,12 +1030,8 @@ namespace Curobo
|
||||
int16_t j_idx = jointMap[j];
|
||||
int j_type = jointMapType[j];
|
||||
|
||||
int8_t axis_sign = 1;
|
||||
float axis_sign = jointOffset[j*2];
|
||||
|
||||
if (j_type != FIXED)
|
||||
{
|
||||
update_joint_type_direction(j_type, axis_sign);
|
||||
}
|
||||
|
||||
// get rotation vector:
|
||||
if (j_type == Z_ROT)
|
||||
@@ -1090,8 +1055,9 @@ namespace Curobo
|
||||
g_position, g_orientation, psum_grad[j_idx], axis_sign);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
__syncthreads();
|
||||
if (PARALLEL_WRITE)
|
||||
{
|
||||
// accumulate the partial sums across the 16 threads
|
||||
@@ -1114,8 +1080,8 @@ namespace Curobo
|
||||
|
||||
for (int16_t j = 0; j < joints_per_thread; j++)
|
||||
{
|
||||
// const int16_t j_idx = elem_idx * joints_per_thread + j;
|
||||
const int16_t j_idx = elem_idx + j * 16;
|
||||
const int16_t j_idx = elem_idx * joints_per_thread + j;
|
||||
//const int16_t j_idx = elem_idx + j * 16;
|
||||
|
||||
if (j_idx >= njoints)
|
||||
{
|
||||
@@ -1151,7 +1117,7 @@ namespace Curobo
|
||||
{
|
||||
{
|
||||
grad_out_link_q[batch * njoints + j] =
|
||||
psum_grad[j]; // write the sum to memory
|
||||
(float) psum_grad[j]; // write the sum to memory
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1195,11 +1161,27 @@ std::vector<torch::Tensor>kin_fused_forward(
|
||||
const torch::Tensor robot_spheres, const torch::Tensor link_map,
|
||||
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
|
||||
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
|
||||
const int batch_size, const int n_spheres,
|
||||
const torch::Tensor joint_offset_map,
|
||||
const int batch_size,
|
||||
const int n_joints,const int n_spheres,
|
||||
const bool use_global_cumul = false)
|
||||
{
|
||||
using namespace Curobo::Kinematics;
|
||||
const int n_joints = joint_vec.size(0) / batch_size;
|
||||
CHECK_INPUT_GUARD(joint_vec);
|
||||
CHECK_INPUT(link_pos);
|
||||
CHECK_INPUT(link_quat);
|
||||
CHECK_INPUT(global_cumul_mat);
|
||||
CHECK_INPUT(batch_robot_spheres);
|
||||
CHECK_INPUT(fixed_transform);
|
||||
CHECK_INPUT(robot_spheres);
|
||||
CHECK_INPUT(link_map);
|
||||
CHECK_INPUT(joint_map);
|
||||
CHECK_INPUT(joint_map_type);
|
||||
CHECK_INPUT(store_link_map);
|
||||
CHECK_INPUT(link_sphere_map);
|
||||
//CHECK_INPUT(link_chain_map);
|
||||
CHECK_INPUT(joint_offset_map);
|
||||
|
||||
const int n_links = link_map.size(0);
|
||||
const int store_n_links = link_pos.size(1);
|
||||
assert(joint_map.dtype() == torch::kInt16);
|
||||
@@ -1238,38 +1220,42 @@ std::vector<torch::Tensor>kin_fused_forward(
|
||||
if (use_global_cumul)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
link_pos.scalar_type(), "kin_fused_forward", ([&] {
|
||||
batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
|
||||
kin_fused_warp_kernel2<scalar_t, true>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
|
||||
link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
|
||||
batch_robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
link_pos.scalar_type(), "kin_fused_forward", ([&] {
|
||||
batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
|
||||
kin_fused_warp_kernel2<scalar_t, false>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
|
||||
link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
|
||||
batch_robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
@@ -1292,15 +1278,29 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
|
||||
const torch::Tensor link_map, const torch::Tensor joint_map,
|
||||
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
|
||||
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
|
||||
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
|
||||
const torch::Tensor joint_offset_map,
|
||||
const int batch_size, const int n_joints, const int n_spheres, const bool sparsity_opt = true,
|
||||
const bool use_global_cumul = false)
|
||||
{
|
||||
using namespace Curobo::Kinematics;
|
||||
|
||||
const int n_joints = joint_vec.size(0) / batch_size;
|
||||
CHECK_INPUT_GUARD(joint_vec);
|
||||
CHECK_INPUT(grad_out);
|
||||
CHECK_INPUT(grad_nlinks_pos);
|
||||
CHECK_INPUT(grad_nlinks_quat);
|
||||
CHECK_INPUT(global_cumul_mat);
|
||||
CHECK_INPUT(fixed_transform);
|
||||
CHECK_INPUT(robot_spheres);
|
||||
CHECK_INPUT(link_map);
|
||||
CHECK_INPUT(joint_map);
|
||||
CHECK_INPUT(joint_map_type);
|
||||
CHECK_INPUT(store_link_map);
|
||||
CHECK_INPUT(link_sphere_map);
|
||||
CHECK_INPUT(link_chain_map);
|
||||
CHECK_INPUT(joint_offset_map);
|
||||
|
||||
const int n_links = link_map.size(0);
|
||||
const int store_n_links = store_link_map.size(0);
|
||||
|
||||
|
||||
// assert(n_links < 128);
|
||||
assert(n_joints < 128); // for larger num. of joints, change kernel3's
|
||||
// MAX_JOINTS template value.
|
||||
@@ -1341,72 +1341,77 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
assert(sparsity_opt);
|
||||
|
||||
if (use_global_cumul)
|
||||
{
|
||||
if (n_joints < 16)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, float, true, true, 16, true>
|
||||
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, double, true, true, 16, true>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
grad_out.data_ptr<scalar_t>(),
|
||||
grad_nlinks_pos.data_ptr<scalar_t>(),
|
||||
grad_nlinks_quat.data_ptr<scalar_t>(),
|
||||
grad_out.data_ptr<float>(),
|
||||
grad_nlinks_pos.data_ptr<float>(),
|
||||
grad_nlinks_quat.data_ptr<float>(),
|
||||
grad_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_chain_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
else if (n_joints < 64)
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, float, true, true, 64, true>
|
||||
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, double, true, true, 64, true>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
grad_out.data_ptr<scalar_t>(),
|
||||
grad_nlinks_pos.data_ptr<scalar_t>(),
|
||||
grad_nlinks_quat.data_ptr<scalar_t>(),
|
||||
grad_out.data_ptr<float>(),
|
||||
grad_nlinks_pos.data_ptr<float>(),
|
||||
grad_nlinks_quat.data_ptr<float>(),
|
||||
grad_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_chain_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, float, true, true, 128, true>
|
||||
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, double, true, true, 128, true>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
grad_out.data_ptr<scalar_t>(),
|
||||
grad_nlinks_pos.data_ptr<scalar_t>(),
|
||||
grad_nlinks_quat.data_ptr<scalar_t>(),
|
||||
grad_out.data_ptr<float>(),
|
||||
grad_nlinks_pos.data_ptr<float>(),
|
||||
grad_nlinks_quat.data_ptr<float>(),
|
||||
grad_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_chain_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
@@ -1417,22 +1422,25 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
|
||||
{
|
||||
//
|
||||
AT_DISPATCH_FLOATING_TYPES(
|
||||
grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, float, false, true, 128, true>
|
||||
grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
|
||||
kin_fused_backward_kernel3<scalar_t, double, false, true, 128, true>
|
||||
<< < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
|
||||
grad_out.data_ptr<scalar_t>(),
|
||||
grad_nlinks_pos.data_ptr<scalar_t>(),
|
||||
grad_nlinks_quat.data_ptr<scalar_t>(),
|
||||
grad_out.data_ptr<float>(),
|
||||
grad_nlinks_pos.data_ptr<float>(),
|
||||
grad_nlinks_quat.data_ptr<float>(),
|
||||
grad_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<scalar_t>(),
|
||||
joint_vec.data_ptr<scalar_t>(),
|
||||
fixed_transform.data_ptr<scalar_t>(),
|
||||
robot_spheres.data_ptr<scalar_t>(),
|
||||
global_cumul_mat.data_ptr<float>(),
|
||||
joint_vec.data_ptr<float>(),
|
||||
fixed_transform.data_ptr<float>(),
|
||||
robot_spheres.data_ptr<float>(),
|
||||
joint_map_type.data_ptr<int8_t>(),
|
||||
joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
|
||||
joint_map.data_ptr<int16_t>(),
|
||||
link_map.data_ptr<int16_t>(),
|
||||
store_link_map.data_ptr<int16_t>(),
|
||||
link_sphere_map.data_ptr<int16_t>(),
|
||||
link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
|
||||
link_chain_map.data_ptr<int16_t>(),
|
||||
joint_offset_map.data_ptr<float>(),
|
||||
batch_size, n_spheres,
|
||||
n_links, n_joints, store_n_links);
|
||||
}));
|
||||
}
|
||||
@@ -1447,7 +1455,8 @@ matrix_to_quaternion(torch::Tensor out_quat,
|
||||
)
|
||||
{
|
||||
using namespace Curobo::Kinematics;
|
||||
|
||||
CHECK_INPUT(out_quat);
|
||||
CHECK_INPUT_GUARD(in_rot);
|
||||
// we compute the warp threads based on number of boxes:
|
||||
|
||||
// TODO: verify this math
|
||||
|
||||
@@ -807,6 +807,7 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
|
||||
{
|
||||
using namespace Curobo::Optimization;
|
||||
|
||||
|
||||
// call first kernel:
|
||||
//const bool use_experimental = true;
|
||||
const bool use_fixed_m = true;
|
||||
|
||||
@@ -15,23 +15,13 @@
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "helper_math.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
#include <vector>
|
||||
#include <torch/torch.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "check_cuda.h"
|
||||
#include "cuda_precisions.h"
|
||||
|
||||
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
|
||||
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) \
|
||||
AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_CUDA(x); \
|
||||
CHECK_CONTIGUOUS(x)
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
#include <cuda_fp8.h>
|
||||
#endif
|
||||
#define M 4
|
||||
#define VOXEL_DEBUG true
|
||||
#define VOXEL_UNOBSERVED_DISTANCE -1000.0
|
||||
@@ -63,8 +53,7 @@ namespace Curobo
|
||||
}
|
||||
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
|
||||
#if CHECK_FP8
|
||||
__device__ __forceinline__ float
|
||||
get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx)
|
||||
{
|
||||
@@ -704,16 +693,21 @@ float4 &sum_pt)
|
||||
int3 &xyz_grid,
|
||||
float &interpolated_distance,
|
||||
int &voxel_idx)
|
||||
{
|
||||
{
|
||||
|
||||
|
||||
// convert location to index: can use floor to cast to int.
|
||||
// to account for negative values, add 0.5 * bounds.
|
||||
const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);
|
||||
const float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z);
|
||||
// xyz_loc = make_int3(floorf((sphere + 0.5 * loc_grid) / loc_grid_params.w));
|
||||
xyz_loc = make_int3(floorf((sphere) / loc_grid_params.w) + (0.5 * loc_grid/loc_grid_params.w));
|
||||
xyz_grid = make_int3(floorf(loc_grid / loc_grid_params.w));
|
||||
//xyz_loc = make_int3(floorf((sphere + 0.5 * loc_grid) / loc_grid_params.w));
|
||||
const float inv_voxel_size = 1.0/loc_grid_params.w;
|
||||
//xyz_loc = make_int3(sphere * inv_voxel_size) + make_int3(0.5 * loc_grid * inv_voxel_size);
|
||||
|
||||
xyz_loc = make_int3((sphere + 0.5 * loc_grid) * inv_voxel_size);
|
||||
|
||||
//xyz_loc = make_int3(sphere / loc_grid_params.w) + make_int3(floorf(0.5 * loc_grid/loc_grid_params.w));
|
||||
xyz_grid = make_int3((loc_grid * inv_voxel_size)) + 1;
|
||||
|
||||
// find next nearest voxel to current point and then do weighted interpolation:
|
||||
voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z;
|
||||
@@ -732,8 +726,8 @@ float4 &sum_pt)
|
||||
|
||||
|
||||
float3 delta = sphere - voxel_origin;
|
||||
int3 next_loc = make_int3(floorf((make_float3(xyz_loc) + normalize(delta))));
|
||||
float ratio = length(delta)/loc_grid_params.w;
|
||||
int3 next_loc = make_int3(((make_float3(xyz_loc) + normalize(delta))));
|
||||
float ratio = length(delta) * inv_voxel_size;
|
||||
|
||||
int next_voxel_idx = next_loc.x * xyz_grid.y * xyz_grid.z + next_loc.y * xyz_grid.z + next_loc.z;
|
||||
|
||||
@@ -2752,7 +2746,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3
|
||||
else
|
||||
{
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
#if CHECK_FP8
|
||||
const auto fp8_type = torch::kFloat8_e4m3fn;
|
||||
#else
|
||||
const auto fp8_type = torch::kHalf;
|
||||
@@ -3217,7 +3211,7 @@ sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
|
||||
|
||||
int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
#if CHECK_FP8
|
||||
const auto fp8_type = torch::kFloat8_e4m3fn;
|
||||
#else
|
||||
const auto fp8_type = torch::kHalf;
|
||||
@@ -3352,7 +3346,7 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
|
||||
|
||||
int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
|
||||
#if CHECK_FP8
|
||||
const auto fp8_type = torch::kFloat8_e4m3fn;
|
||||
#else
|
||||
const auto fp8_type = torch::kHalf;
|
||||
|
||||
@@ -43,100 +43,10 @@ def rotation_matrix_to_quaternion(in_mat, out_quat):
|
||||
|
||||
|
||||
class KinematicsFusedFunction(Function):
|
||||
@staticmethod
|
||||
def _call_cuda(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
r = kinematics_fused_cu.forward(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
False,
|
||||
)
|
||||
out_link_pos = r[0]
|
||||
out_link_quat = r[1]
|
||||
out_spheres = r[2]
|
||||
global_cumul_mat = r[3]
|
||||
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
|
||||
|
||||
@staticmethod
|
||||
def _call_backward_cuda(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
sparsity_opt=True,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
if grad_out.is_contiguous():
|
||||
grad_out = grad_out.view(-1)
|
||||
else:
|
||||
grad_out = grad_out.reshape(-1)
|
||||
r = kinematics_fused_cu.backward(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
sparsity_opt,
|
||||
False,
|
||||
)
|
||||
out_q = r[0].view(b_size, -1)
|
||||
|
||||
return out_q
|
||||
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
# link_mats: torch.Tensor,
|
||||
link_pos: torch.Tensor,
|
||||
link_quat: torch.Tensor,
|
||||
b_robot_spheres: torch.tensor,
|
||||
@@ -150,9 +60,17 @@ class KinematicsFusedFunction(Function):
|
||||
store_link_map: torch.Tensor,
|
||||
link_sphere_map: torch.Tensor,
|
||||
link_chain_map: torch.Tensor,
|
||||
joint_offset_map: torch.Tensor,
|
||||
grad_out: torch.Tensor,
|
||||
use_global_cumul: bool = True,
|
||||
):
|
||||
link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
|
||||
ctx.use_global_cumul = use_global_cumul
|
||||
b_shape = link_pos.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = b_robot_spheres.shape[1]
|
||||
n_joints = joint_seq.shape[-1]
|
||||
|
||||
r = kinematics_fused_cu.forward(
|
||||
link_pos,
|
||||
link_quat,
|
||||
b_robot_spheres,
|
||||
@@ -165,7 +83,17 @@ class KinematicsFusedFunction(Function):
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
joint_offset_map, # offset_joint_map
|
||||
b_size,
|
||||
n_joints,
|
||||
n_spheres,
|
||||
True,
|
||||
)
|
||||
out_link_pos = r[0]
|
||||
out_link_quat = r[1]
|
||||
out_spheres = r[2]
|
||||
global_cumul_mat = r[3]
|
||||
|
||||
ctx.save_for_backward(
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
@@ -176,11 +104,11 @@ class KinematicsFusedFunction(Function):
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
)
|
||||
|
||||
return link_pos, link_quat, b_robot_spheres
|
||||
return out_link_pos, out_link_quat, out_spheres
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
|
||||
@@ -197,9 +125,11 @@ class KinematicsFusedFunction(Function):
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
) = ctx.saved_tensors
|
||||
|
||||
grad_joint = KinematicsFusedFunction._call_backward_cuda(
|
||||
grad_out,
|
||||
grad_out_link_pos,
|
||||
@@ -215,7 +145,9 @@ class KinematicsFusedFunction(Function):
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
True,
|
||||
use_global_cumul=ctx.use_global_cumul,
|
||||
)
|
||||
|
||||
return (
|
||||
@@ -233,53 +165,10 @@ class KinematicsFusedFunction(Function):
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
|
||||
class KinematicsFusedGlobalCumulFunction(Function):
|
||||
@staticmethod
|
||||
def _call_cuda(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_size = b_shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
# print(n_spheres)
|
||||
# print(angle)
|
||||
r = kinematics_fused_cu.forward(
|
||||
link_pos_out,
|
||||
link_quat_out,
|
||||
robot_sphere_out,
|
||||
global_cumul_mat,
|
||||
angle,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
b_size,
|
||||
n_spheres,
|
||||
True,
|
||||
)
|
||||
out_link_pos = r[0]
|
||||
out_link_quat = r[1]
|
||||
out_spheres = r[2]
|
||||
global_cumul_mat = r[3]
|
||||
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
|
||||
|
||||
@staticmethod
|
||||
def _call_backward_cuda(
|
||||
grad_out,
|
||||
@@ -296,18 +185,18 @@ class KinematicsFusedGlobalCumulFunction(Function):
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
sparsity_opt=True,
|
||||
use_global_cumul=False,
|
||||
):
|
||||
b_shape = link_pos_out.shape
|
||||
b_shape = grad_out.shape
|
||||
b_size = b_shape[0]
|
||||
# b_size = link_mat_out.shape[0]
|
||||
n_spheres = robot_sphere_out.shape[1]
|
||||
n_joints = angle.shape[-1]
|
||||
if grad_out.is_contiguous():
|
||||
grad_out = grad_out.view(-1)
|
||||
else:
|
||||
grad_out = grad_out.reshape(-1)
|
||||
# create backward tensors?
|
||||
|
||||
r = kinematics_fused_cu.backward(
|
||||
grad_out,
|
||||
link_pos_out,
|
||||
@@ -323,126 +212,17 @@ class KinematicsFusedGlobalCumulFunction(Function):
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
joint_offset_map, # offset_joint_map
|
||||
b_size,
|
||||
n_joints,
|
||||
n_spheres,
|
||||
sparsity_opt,
|
||||
True,
|
||||
use_global_cumul,
|
||||
)
|
||||
out_q = r[0] # .view(angle.shape)
|
||||
out_q = out_q.view(b_size, -1)
|
||||
out_q = r[0].view(b_size, -1)
|
||||
|
||||
return out_q
|
||||
|
||||
@staticmethod
|
||||
def forward(
|
||||
ctx,
|
||||
# link_mats: torch.Tensor,
|
||||
link_pos: torch.Tensor,
|
||||
link_quat: torch.Tensor,
|
||||
b_robot_spheres: torch.tensor,
|
||||
global_cumul_mat: torch.Tensor,
|
||||
joint_seq: torch.Tensor,
|
||||
fixed_transform: torch.tensor,
|
||||
robot_spheres: torch.tensor,
|
||||
link_map: torch.tensor,
|
||||
joint_map: torch.Tensor,
|
||||
joint_map_type: torch.Tensor,
|
||||
store_link_map: torch.Tensor,
|
||||
link_sphere_map: torch.Tensor,
|
||||
link_chain_map: torch.Tensor,
|
||||
grad_out: torch.Tensor,
|
||||
):
|
||||
if joint_seq.is_contiguous():
|
||||
joint_seq = joint_seq.view(-1)
|
||||
else:
|
||||
joint_seq = joint_seq.reshape(-1)
|
||||
(
|
||||
link_pos,
|
||||
link_quat,
|
||||
b_robot_spheres,
|
||||
global_cumul_mat,
|
||||
) = KinematicsFusedGlobalCumulFunction._call_cuda(
|
||||
link_pos,
|
||||
link_quat,
|
||||
b_robot_spheres,
|
||||
global_cumul_mat,
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
)
|
||||
ctx.save_for_backward(
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
)
|
||||
|
||||
return link_pos, link_quat, b_robot_spheres
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
|
||||
grad_joint = None
|
||||
|
||||
if ctx.needs_input_grad[4]:
|
||||
(
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
grad_out,
|
||||
global_cumul_mat,
|
||||
) = ctx.saved_tensors
|
||||
grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
|
||||
grad_out,
|
||||
grad_out_link_pos,
|
||||
grad_out_link_quat,
|
||||
grad_out_spheres,
|
||||
global_cumul_mat,
|
||||
joint_seq,
|
||||
fixed_transform,
|
||||
robot_spheres,
|
||||
link_map,
|
||||
joint_map,
|
||||
joint_map_type,
|
||||
store_link_map,
|
||||
link_sphere_map,
|
||||
link_chain_map,
|
||||
True,
|
||||
)
|
||||
|
||||
return (
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
grad_joint,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
|
||||
def get_cuda_kinematics(
|
||||
link_pos_seq,
|
||||
@@ -458,41 +238,28 @@ def get_cuda_kinematics(
|
||||
store_link_map,
|
||||
link_sphere_idx_map, # sphere idx map
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
grad_out_q,
|
||||
use_global_cumul: bool = True,
|
||||
):
|
||||
if use_global_cumul:
|
||||
link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
|
||||
link_pos_seq,
|
||||
link_quat_seq,
|
||||
batch_robot_spheres,
|
||||
global_cumul_mat,
|
||||
q_in,
|
||||
fixed_transform,
|
||||
link_spheres_tensor,
|
||||
link_map, # tells which link is attached to which link i
|
||||
joint_map, # tells which joint is attached to a link i
|
||||
joint_map_type, # joint type
|
||||
store_link_map,
|
||||
link_sphere_idx_map, # sphere idx map
|
||||
link_chain_map,
|
||||
grad_out_q,
|
||||
)
|
||||
else:
|
||||
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
|
||||
link_pos_seq,
|
||||
link_quat_seq,
|
||||
batch_robot_spheres,
|
||||
global_cumul_mat,
|
||||
q_in,
|
||||
fixed_transform,
|
||||
link_spheres_tensor,
|
||||
link_map, # tells which link is attached to which link i
|
||||
joint_map, # tells which joint is attached to a link i
|
||||
joint_map_type, # joint type
|
||||
store_link_map,
|
||||
link_sphere_idx_map, # sphere idx map
|
||||
link_chain_map,
|
||||
grad_out_q,
|
||||
)
|
||||
# if not q_in.is_contiguous():
|
||||
# q_in = q_in.contiguous()
|
||||
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
|
||||
link_pos_seq,
|
||||
link_quat_seq,
|
||||
batch_robot_spheres,
|
||||
global_cumul_mat,
|
||||
q_in,
|
||||
fixed_transform,
|
||||
link_spheres_tensor,
|
||||
link_map, # tells which link is attached to which link i
|
||||
joint_map, # tells which joint is attached to a link i
|
||||
joint_map_type, # joint type
|
||||
store_link_map,
|
||||
link_sphere_idx_map, # sphere idx map
|
||||
link_chain_map,
|
||||
joint_offset_map,
|
||||
grad_out_q,
|
||||
use_global_cumul,
|
||||
)
|
||||
return link_pos, link_quat, robot_spheres
|
||||
|
||||
Reference in New Issue
Block a user