improved joint space planning

2024-05-30 14:42:22 -07:00
parent 3bfed9d773
commit 0c51dd2da8
28 changed files with 1135 additions and 213 deletions
--- a/src/curobo/rollout/arm_base.py
+++ b/src/curobo/rollout/arm_base.py
@@ -261,7 +261,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
                log_warn(
                    "null space cost is deprecated, use null_space_weight in bound cost instead"
                )
-
+            self.cost_cfg.bound_cfg.dof = self.n_dofs
            self.bound_cost = BoundCost(self.cost_cfg.bound_cfg)

        if self.cost_cfg.manipulability_cfg is not None:
@@ -315,10 +315,12 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        self.cost_cfg.bound_cfg.state_finite_difference_mode = (
            self.dynamics_model.state_finite_difference_mode
        )
-
+        self.cost_cfg.bound_cfg.dof = self.n_dofs
+        self.constraint_cfg.bound_cfg.dof = self.n_dofs
        self.bound_constraint = BoundCost(self.constraint_cfg.bound_cfg)

        if self.convergence_cfg.null_space_cfg is not None:
+            self.convergence_cfg.null_space_cfg.dof = self.n_dofs
            self.null_convergence = DistCost(self.convergence_cfg.null_space_cfg)

        # set start state:
@@ -578,6 +580,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        ----------
        action_seq: torch.Tensor [num_particles, horizon, d_act]
        """
+
        # print(act_seq.shape, self._goal_buffer.batch_current_state_idx)
        if self.start_state is None:
            raise ValueError("start_state is not set in rollout")
@@ -585,6 +588,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
            state = self.dynamics_model.forward(
                self.start_state, act_seq, self._goal_buffer.batch_current_state_idx
            )
+
        with profiler.record_function("cost/all"):
            cost_seq = self.cost_fn(state, act_seq)

--- a/src/curobo/rollout/arm_reacher.py
+++ b/src/curobo/rollout/arm_reacher.py
@@ -174,6 +174,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
        self._n_goalset = 1

        if self.cost_cfg.cspace_cfg is not None:
+            self.cost_cfg.cspace_cfg.dof = self.d_action
            # self.cost_cfg.cspace_cfg.update_vec_weight(self.dynamics_model.cspace_distance_weight)
            self.dist_cost = DistCost(self.cost_cfg.cspace_cfg)
        if self.cost_cfg.pose_cfg is not None:
@@ -226,6 +227,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                if i != self.kinematics.ee_link:
                    self._link_pose_convergence[i] = PoseCost(self.convergence_cfg.link_pose_cfg)
        if self.convergence_cfg.cspace_cfg is not None:
+            self.convergence_cfg.cspace_cfg.dof = self.d_action
            self.cspace_convergence = DistCost(self.convergence_cfg.cspace_cfg)

        # check if g_dist is required in any of the cost terms:
--- a/src/curobo/rollout/cost/bound_cost.py
+++ b/src/curobo/rollout/cost/bound_cost.py
@@ -22,7 +22,7 @@ from curobo.cuda_robot_model.types import JointLimits
 from curobo.types.robot import JointState
 from curobo.types.tensor import T_DOF
 from curobo.util.logger import log_error
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp

 # Local Folder
@@ -49,6 +49,7 @@ class BoundCostConfig(CostConfig):
    activation_distance: Union[torch.Tensor, float] = 0.0
    state_finite_difference_mode: str = "BACKWARD"
    null_space_weight: Optional[List[float]] = None
+    use_l2_kernel: bool = False

    def set_bounds(self, bounds: JointLimits, teleport_mode: bool = False):
        self.joint_limits = bounds.clone()
@@ -104,12 +105,21 @@ class BoundCost(CostBase, BoundCostConfig):
            (0, 0, 0), device=self.tensor_args.device, dtype=self.tensor_args.dtype
        )
        self._out_gv_buffer = self._out_ga_buffer = self._out_gj_buffer = empty_buffer
+        if self.use_l2_kernel:
+            if self.cost_type == BoundCostType.POSITION:
+                self._l2_cost = make_bound_pos_kernel(self.dof)
+            if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
+                self._l2_cost = make_bound_pos_smooth_kernel(self.dof)

    def update_batch_size(self, batch, horizon, dof):
        if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
            self._out_c_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
            )
+            self._out_c_sum_buffer = torch.zeros(
+                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            )
+
            self._out_gp_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
            )
@@ -184,36 +194,61 @@ class BoundCost(CostBase, BoundCostConfig):
            retract_idx = self._retract_cfg_idx

        if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
-            # print(self.joint_limits.jerk.shape, self.joint_limits.position.shape)
-            cost = WarpBoundSmoothFunction.apply(
-                state_batch.position,
-                state_batch.velocity,
-                state_batch.acceleration,
-                state_batch.jerk,
-                retract_config,
-                retract_idx,
-                self.joint_limits.position,
-                self.joint_limits.velocity,
-                self.joint_limits.acceleration,
-                self.joint_limits.jerk,
-                self.weight,
-                self.activation_distance,
-                self.smooth_weight,
-                self.cspace_distance_weight,
-                self.null_space_weight,
-                self.vec_weight,
-                self._run_weight_vel,
-                self._run_weight_acc,
-                self._run_weight_jerk,
-                self._out_c_buffer,
-                self._out_gp_buffer,
-                self._out_gv_buffer,
-                self._out_ga_buffer,
-                self._out_gj_buffer,
-            )
-            # print(self.cspace_distance_weight.shape)
-            # print(cost)
-            # print(self._run_weight_acc)
+            if self.use_l2_kernel:
+                cost = WarpBoundSmoothL2Function.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_sum_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                    self._l2_cost,
+                )
+            else:
+                cost = WarpBoundSmoothFunction.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                )
        elif self.cost_type == BoundCostType.BOUNDS:
            cost = WarpBoundFunction.apply(
                state_batch.position,
@@ -237,8 +272,8 @@ class BoundCost(CostBase, BoundCostConfig):
                self._out_gj_buffer,
            )
        elif self.cost_type == BoundCostType.POSITION:
-            if self.return_loss:
-                cost = WarpBoundPosLoss.apply(
+            if self.use_l2_kernel:
+                cost = WarpBoundPosL2Function.apply(
                    state_batch.position,
                    retract_config,
                    retract_idx,
@@ -247,8 +282,10 @@ class BoundCost(CostBase, BoundCostConfig):
                    self.activation_distance,
                    self.null_space_weight,
                    self.vec_weight,
-                    self._out_c_buffer,
+                    self._out_c_sum_buffer,
                    self._out_gp_buffer,
+                    self._l2_cost,
+                    self.return_loss,
                )
            else:
                cost = WarpBoundPosFunction.apply(
@@ -262,6 +299,7 @@ class BoundCost(CostBase, BoundCostConfig):
                    self.vec_weight,
                    self._out_c_buffer,
                    self._out_gp_buffer,
+                    self.return_loss,
                )

        else:
@@ -458,6 +496,132 @@ class WarpBoundSmoothFunction(torch.autograd.Function):
        )


+# create a bound cost tensor:
+class WarpBoundSmoothL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        vel,
+        acc,
+        jerk,
+        retract_config,
+        retract_idx,
+        p_b,
+        v_b,
+        a_b,
+        j_b,
+        weight,
+        activation_distance,
+        smooth_weight,
+        cspace_weight,
+        null_space_weight,
+        vec_weight,
+        run_weight_vel,
+        run_weight_acc,
+        run_weight_jerk,
+        out_cost,
+        out_gp,
+        out_gv,
+        out_ga,
+        out_gj,
+        warp_function,
+    ):
+        # scale the weights for smoothness by this dt:
+        wp_device = wp.device_from_torch(vel.device)
+        # assert smooth_weight.shape[0] == 7
+        b, h, dof = vel.shape
+        requires_grad = pos.requires_grad
+
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(vel.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(acc.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(jerk.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_b.view(-1), dtype=wp.float32),
+                wp.from_torch(v_b.view(-1), dtype=wp.float32),
+                wp.from_torch(a_b.view(-1), dtype=wp.float32),
+                wp.from_torch(j_b.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(smooth_weight, dtype=wp.float32),
+                wp.from_torch(cspace_weight, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_vel.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_acc.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_jerk.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gv.view(-1), dtype=wp.float32),
+                wp.from_torch(out_ga.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gj.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(vel.device),
+        )
+        ctx.save_for_backward(out_gp, out_gv, out_ga, out_gj)
+
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (
+            p_grad,
+            v_grad,
+            a_grad,
+            j_grad,
+        ) = ctx.saved_tensors
+        v_g = None
+        a_g = None
+        p_g = None
+        j_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[1]:
+            v_g = v_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[2]:
+            a_g = a_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[3]:
+            j_g = j_grad  # * grad_out_cost#.unsqueeze(-1)
+        return (
+            p_g,
+            v_g,
+            a_g,
+            j_g,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
 class WarpBoundFunction(torch.autograd.Function):
    @staticmethod
    def forward(
@@ -581,6 +745,7 @@ class WarpBoundPosFunction(torch.autograd.Function):
        vec_weight,
        out_cost,
        out_gp,
+        return_loss=False,
    ):
        wp_device = wp.device_from_torch(pos.device)
        b, h, dof = pos.shape
@@ -607,10 +772,9 @@ class WarpBoundPosFunction(torch.autograd.Function):
            device=wp_device,
            stream=wp.stream_from_torch(pos.device),
        )
+        ctx.return_loss = return_loss
        ctx.save_for_backward(out_gp)
-        # cost = torch.linalg.norm(out_cost, dim=-1)
        cost = torch.sum(out_cost, dim=-1)
-        # cost = out_cost
        return cost

    @staticmethod
@@ -619,20 +783,71 @@ class WarpBoundPosFunction(torch.autograd.Function):

        p_g = None
        if ctx.needs_input_grad[0]:
-            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+
+        return p_g, None, None, None, None, None, None, None, None, None, None


 # create a bound cost tensor:
-class WarpBoundPosLoss(WarpBoundPosFunction):
+class WarpBoundPosL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        retract_config,
+        retract_idx,
+        p_l,
+        weight,
+        activation_distance,
+        null_space_weight,
+        vec_weight,
+        out_cost,
+        out_gp,
+        warp_function,
+        return_loss=False,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        requires_grad = pos.requires_grad
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_l.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+        ctx.return_loss = return_loss
+        ctx.save_for_backward(out_gp)
+        # cost = torch.sum(out_cost, dim=-1)
+        return out_cost
+
    @staticmethod
    def backward(ctx, grad_out_cost):
        (p_grad,) = ctx.saved_tensors

        p_g = None
        if ctx.needs_input_grad[0]:
-            p_g = p_grad * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+        return p_g, None, None, None, None, None, None, None, None, None, None, None


@wp.kernel
@@ -1122,3 +1337,314 @@ def forward_bound_smooth_warp(
        out_grad_v[b_addrs] = g_v
        out_grad_a[b_addrs] = g_a
        out_grad_j[b_addrs] = g_j
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_smooth_kernel(dof_template: int):
+    def forward_bound_smooth_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        vel: wp.array(dtype=wp.float32),
+        acc: wp.array(dtype=wp.float32),
+        jerk: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        v_b: wp.array(dtype=wp.float32),
+        a_b: wp.array(dtype=wp.float32),
+        j_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        smooth_weight: wp.array(dtype=wp.float32),
+        cspace_weight: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        run_weight_vel: wp.array(dtype=wp.float32),
+        run_weight_acc: wp.array(dtype=wp.float32),
+        run_weight_jerk: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        out_grad_v: wp.array(dtype=wp.float32),
+        out_grad_a: wp.array(dtype=wp.float32),
+        out_grad_j: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        b_wv = float(0.0)
+        b_wa = float(0.0)
+        b_wj = float(0.0)
+
+        r_wa = wp.float32(0.0)
+        r_wj = wp.float32(0.0)
+
+        w_a = wp.float32(0.0)
+        w_j = wp.float32(0.0)
+
+        s_a = wp.float32(0.0)
+        s_j = wp.float32(0.0)
+
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        cspace_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        v_l = wp.vector(dtype=wp.float32, length=dof_template)
+        v_u = wp.vector(dtype=wp.float32, length=dof_template)
+        a_l = wp.vector(dtype=wp.float32, length=dof_template)
+        a_u = wp.vector(dtype=wp.float32, length=dof_template)
+        j_l = wp.vector(dtype=wp.float32, length=dof_template)
+        j_u = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_v = wp.vector(dtype=wp.float32, length=dof_template)
+        g_a = wp.vector(dtype=wp.float32, length=dof_template)
+        g_j = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = wp.int32(0.0)
+        target_id = retract_idx[b_id]
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_v = wp.vector(dtype=wp.float32, length=dof_template)
+        current_a = wp.vector(dtype=wp.float32, length=dof_template)
+        current_j = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            current_v[i] = vel[b_addrs + i]
+            current_a[i] = acc[b_addrs + i]
+            current_j[i] = jerk[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+            v_l[i] = v_b[i]
+            v_u[i] = v_b[dof + i]
+            a_l[i] = a_b[i]
+            a_u[i] = a_b[dof + i]
+            j_l[i] = j_b[i]
+            j_u[i] = j_b[dof + i]
+            cspace_weight_local[i] = cspace_weight[i]
+
+        # read weights:
+        w = weight[0]
+        b_wv = weight[1]
+        b_wa = weight[2]
+        b_wj = weight[3]
+        r_wa = run_weight_acc[h_id]
+        r_wj = run_weight_jerk[h_id]
+
+        w_a = smooth_weight[1]
+        w_j = smooth_weight[2]
+
+        # compute cost:
+
+        # read buffers:
+
+        # if w_j > 0.0:
+        eta_p = activation_distance[0]
+        eta_v = activation_distance[1]
+        eta_a = activation_distance[2]
+        eta_j = activation_distance[3]
+
+        p_range = p_u - p_l
+        p_l = p_l + (p_range) * eta_p
+        p_u = p_u - (p_range) * eta_p
+        v_l = v_l + (v_u - v_l) * eta_v
+        v_u = v_u - (v_u - v_l) * eta_v
+        a_l = a_l + (a_u - a_l) * eta_a
+        a_u = a_u - (a_u - a_l) * eta_a
+        j_l = j_l + (j_u - j_l) * eta_j
+        j_u = j_u - (j_u - j_l) * eta_j
+
+        # position:
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+        # bound cost:
+        bound_delta_p = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_v = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_a = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_j = wp.vector(dtype=wp.float32, length=dof_template)
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta_p[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta_p[i] = current_p[i] - p_u[i]
+            if current_v[i] < v_l[i]:
+                bound_delta_v[i] = current_v[i] - v_l[i]
+            elif current_v[i] > v_u[i]:
+                bound_delta_v[i] = current_v[i] - v_u[i]
+            if current_a[i] < a_l[i]:
+                bound_delta_a[i] = current_a[i] - a_l[i]
+            elif current_a[i] > a_u[i]:
+                bound_delta_a[i] = current_a[i] - a_u[i]
+            if current_j[i] < j_l[i]:
+                bound_delta_j[i] = current_j[i] - j_l[i]
+            elif current_j[i] > j_u[i]:
+                bound_delta_j[i] = current_j[i] - j_u[i]
+        delta_p = wp.length(bound_delta_p)
+        if delta_p > 0.0:
+            g_p += w * bound_delta_p / delta_p
+            c_total += w * delta_p
+        delta_v = wp.length(bound_delta_v)
+        if delta_v > 0.0:
+            g_v = b_wv * bound_delta_v / delta_v
+            c_total += b_wv * delta_v
+        delta_a = wp.length(bound_delta_a)
+        if delta_a > 0.0:
+            g_a = b_wa * bound_delta_a / delta_a
+            c_total += b_wa * delta_a
+        delta_j = wp.length(bound_delta_j)
+        if delta_j > 0.0:
+            g_j = b_wj * bound_delta_j / delta_j
+            c_total += b_wj * delta_j
+
+        delta_acc = wp.cw_mul(cspace_weight_local, current_a)
+        # delta_acc = wp.cw_div(delta_acc, a_u - a_l)
+        acc_length = wp.length_sq(delta_acc)
+        s_a = w_a * r_wa * acc_length
+        if acc_length > 0.0:
+            g_a += 2.0 * w_a * r_wa * delta_acc  # / acc_length
+
+        delta_jerk = wp.cw_mul(cspace_weight_local, current_j)
+        # delta_jerk = wp.cw_div(delta_jerk, j_u - j_l)
+        jerk_length = wp.length_sq(delta_jerk)
+        s_j = w_j * r_wj * jerk_length
+        if jerk_length > 0.0:
+            g_j += 2.0 * w_j * r_wj * delta_jerk  # / jerk_length
+
+        c_total += s_a + s_j
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+                out_grad_v[b_addrs + i] = g_v[i]
+                out_grad_a[b_addrs + i] = g_a[i]
+                out_grad_j[b_addrs + i] = g_j[i]
+
+    return wp.Kernel(forward_bound_smooth_loop_warp)
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_kernel(dof_template: int):
+
+    def forward_bound_pos_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        eta_p = activation_distance[0]
+        w = weight[0]
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+        target_id = wp.int32(0.0)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta = wp.vector(dtype=wp.float32, length=dof_template)
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_id = retract_idx[b_id]
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+
+        p_range = p_u - p_l
+        eta_percent = eta_p * (p_range)
+        p_l += eta_percent
+        p_u -= eta_percent
+
+        # compute retract cost:
+
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta[i] = current_p[i] - p_u[i]
+
+        delta = wp.length(bound_delta)
+        if delta > 0.0:
+            g_p += w * bound_delta / delta
+            c_total += w * delta
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_bound_pos_loop_warp)
--- a/src/curobo/rollout/cost/dist_cost.py
+++ b/src/curobo/rollout/cost/dist_cost.py
@@ -18,7 +18,8 @@ import torch
 import warp as wp

 # CuRobo
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.logger import log_error
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp

 # Local Folder
@@ -37,6 +38,7 @@ class DistType(Enum):
 class DistCostConfig(CostConfig):
    dist_type: DistType = DistType.L2
    use_null_space: bool = False
+    use_l2_kernel: bool = True

    def __post_init__(self):
        return super().__post_init__()
@@ -142,6 +144,89 @@ def forward_l2_warp(
        out_grad_p[b_addrs] = g_p


+@get_cache_fn_decorator()
+def make_l2_kernel(dof_template: int):
+    def forward_l2_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        target: wp.array(dtype=wp.float32),
+        target_idx: wp.array(dtype=wp.int32),
+        weight: wp.array(dtype=wp.float32),
+        run_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = wp.int32(0)
+        target_id = wp.int32(0)
+        w = wp.float32(0.0)
+        r_w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        w = weight[0]
+        r_w = run_weight[h_id]
+        w = r_w * w
+
+        if w == 0.0:
+            return
+        # compute cost:
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        # read buffers:
+        current_position = wp.vector(dtype=wp.float32, length=dof_template)
+        target_position = wp.vector(dtype=wp.float32, length=dof_template)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = target_idx[b_id]
+        target_id = target_id * dof
+
+        for i in range(dof_template):
+            current_position[i] = pos[b_addrs + i]
+            target_position[i] = target[target_id + i]
+            vec_weight_local[i] = vec_weight[i]
+
+        error = wp.cw_mul(vec_weight_local, (current_position - target_position))
+
+        c_length = wp.length(error)
+        if w > 100.0:
+            p_w_alpha = 70.0
+            c_total = w * wp.log2(wp.cosh(p_w_alpha * c_length))
+            g_p = error * (
+                w
+                * p_w_alpha
+                * wp.sinh(p_w_alpha * c_length)
+                / (c_length * wp.cosh(p_w_alpha * c_length))
+            )
+        else:
+            g_p = w * error
+            if c_length > 0.0:
+                g_p = g_p / c_length
+            c_total = w * c_length
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_l2_loop_warp)
+
+
 # create a bound cost tensor:
 class L2DistFunction(torch.autograd.Function):
    @staticmethod
@@ -195,6 +280,58 @@ class L2DistFunction(torch.autograd.Function):
        return p_g, None, None, None, None, None, None, None, None


+# create a bound cost tensor:
+class L2DistLoopFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        target,
+        target_idx,
+        weight,
+        run_weight,
+        vec_weight,
+        out_cost,
+        out_cost_v,
+        out_gp,
+        l2_dof_kernel,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        wp.launch(
+            kernel=l2_dof_kernel,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1).contiguous(), dtype=wp.float32),
+                wp.from_torch(target.view(-1), dtype=wp.float32),
+                wp.from_torch(target_idx.view(-1), dtype=wp.int32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(run_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                pos.requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+
+        ctx.save_for_backward(out_gp)
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (p_grad,) = ctx.saved_tensors
+
+        p_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad
+        return p_g, None, None, None, None, None, None, None, None, None
+
+
 class DistCost(CostBase, DistCostConfig):
    def __init__(self, config: Optional[DistCostConfig] = None):
        if config is not None:
@@ -202,6 +339,8 @@ class DistCost(CostBase, DistCostConfig):
        CostBase.__init__(self)
        self._init_post_config()
        init_warp()
+        if self.use_l2_kernel:
+            self._l2_dof_kernel = make_l2_kernel(self.dof)

    def _init_post_config(self):
        if self.vec_weight is not None:
@@ -210,13 +349,21 @@ class DistCost(CostBase, DistCostConfig):
                self.vec_weight = self.vec_weight * 0.0 + 1.0

    def update_batch_size(self, batch, horizon, dof):
+        if dof != self.dof:
+            log_error("dof cannot be changed after initializing DistCost")
        if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
-            self._out_cv_buffer = torch.zeros(
-                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
-            self._out_c_buffer = torch.zeros(
-                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
+            self._out_c_buffer = None
+            self._out_cv_buffer = None
+            if self.use_l2_kernel:
+                self._out_c_buffer = torch.zeros(
+                    (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+                )
+            else:
+                self._out_cv_buffer = torch.zeros(
+                    (batch, horizon, dof),
+                    device=self.tensor_args.device,
+                    dtype=self.tensor_args.dtype,
+                )

            self._out_g_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
@@ -293,24 +440,59 @@ class DistCost(CostBase, DistCostConfig):
        else:
            raise NotImplementedError("terminal flag needs to be set to true")
        if self.dist_type == DistType.L2:
-            # print(goal_idx.shape, goal_vec.shape)
-            cost = L2DistFunction.apply(
-                current_vec,
-                goal_vec,
-                goal_idx,
-                self.weight,
-                self._run_weight_vec,
-                self.vec_weight,
-                self._out_c_buffer,
-                self._out_cv_buffer,
-                self._out_g_buffer,
-            )
+            if self.use_l2_kernel:
+
+                cost = L2DistLoopFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    self._out_c_buffer,
+                    None,
+                    self._out_g_buffer,
+                    self._l2_dof_kernel,
+                )
+            else:
+                cost = L2DistFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    None,
+                    self._out_cv_buffer,
+                    self._out_g_buffer,
+                )

        else:
            raise NotImplementedError()
        if RETURN_GOAL_DIST:
-            dist_scale = torch.nan_to_num(
-                1.0 / torch.sqrt((self.weight * self._run_weight_vec)), 0.0
-            )
-            return cost, cost * dist_scale
+
+            if self.use_l2_kernel:
+                distance = weight_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            else:
+                distance = squared_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            return cost, distance
+
        return cost
+
+
+@get_torch_jit_decorator()
+def squared_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = torch.sqrt(cost * weight_inv)
+    return distance
+
+
+@get_torch_jit_decorator()
+def weight_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = cost * weight_inv
+    return distance
--- a/src/curobo/rollout/dynamics_model/tensor_step.py
+++ b/src/curobo/rollout/dynamics_model/tensor_step.py
@@ -522,27 +522,20 @@ class TensorStepPositionCliqueKernel(TensorStepBase):
                    self._u_grad,
                )
            if self._filter_velocity:
+                out_state_seq.aux_data["raw_velocity"] = out_state_seq.velocity
                out_state_seq.velocity = self.filter_signal(out_state_seq.velocity)

            if self._filter_acceleration:
+                out_state_seq.aux_data["raw_acceleration"] = out_state_seq.acceleration
                out_state_seq.acceleration = self.filter_signal(out_state_seq.acceleration)

            if self._filter_jerk:
+                out_state_seq.aux_data["raw_jerk"] = out_state_seq.jerk
                out_state_seq.jerk = self.filter_signal(out_state_seq.jerk)
        return out_state_seq

    def filter_signal(self, signal: torch.Tensor):
        return filter_signal_jit(signal, self._sma_kernel)
-        b, h, dof = signal.shape
-        new_signal = (
-            self._sma(
-                signal.transpose(-1, -2).reshape(b * dof, 1, h), self._sma_kernel, padding="same"
-            )
-            .view(b, dof, h)
-            .transpose(-1, -2)
-            .contiguous()
-        )
-        return new_signal


@get_torch_jit_decorator(force_jit=True)