add debug code for replay

update the image size for inference
update benchmark view point
2026-06-15 14:58:28 +08:00 · 2026-06-12 14:36:13 +08:00 · 2026-05-25 18:12:37 +08:00
3 changed files with 69 additions and 20 deletions
--- a/benchmark.yaml
+++ b/benchmark.yaml
@@ -29,7 +29,7 @@ scene:
      - 0.001
      - 0.001
      position:
-      - 0.419859
+      - 0.15
      - -4.02430000000001
      - 0.510259093
      rotation:
@@ -47,7 +47,7 @@ scene:
      - 0.001
      - 0.001
      position:
-      - 0.419859
+      - 0.15
      - -4.152430000000001
      - 0.510259093
      quaternion: [1, 0, 0, 0]
@@ -57,7 +57,7 @@ scene:
    r1pro_dex:
      name: r1pro_dex
      asset_path: asset://robots/r1pro/r1pro_dex.usd
-      position: [-0.5, -4.0, 0.0]
+      position: [-0.2, -4.1, 0.0]
      rotation: [1, 0, 0, 0]
      stereotype: modular_robot
      source: local
@@ -67,18 +67,18 @@ scene:
        torso_joint2: 0.0
        torso_joint3: 0.0
        torso_joint4: 0.0
-        left_arm_joint1: 0.0
-        left_arm_joint2: 0.5
+        left_arm_joint1: -0.2
+        left_arm_joint2: 0.05
        left_arm_joint3: 0.0
        left_arm_joint4: -1.0
        left_arm_joint5: 0.0
        left_arm_joint6: 0.0
        left_arm_joint7: 0.0
-        right_arm_joint1: 0.0
-        right_arm_joint2: -0.5
+        right_arm_joint1: -0.2
+        right_arm_joint2: -0.05
        right_arm_joint3: 0.0
        right_arm_joint4: -1.0
-        right_arm_joint5: 0.0
+        right_arm_joint5: 0.1
        right_arm_joint6: 0.0
        right_arm_joint7: 0.0

@@ -137,22 +137,33 @@ scene:
    head_camera:
      name: head_camera
      stereotype: camera
-      position: [-0.4, -4.0, 1.2]
-      look_at:
-        is_point: true
-        look_at_point: [0.4200, -4.1530, 0.4885]
      data_types: [rgb]
      width: 1280
      height: 720
      camera_model: pinhole
      fix_camera: true
+      focal_length: 2.8
+      horizontal_aperture: 4.890881131191918
+      vertical_aperture: 2.7608816125932627
+      convention: opengl
+      attach_to:
+        target_name: r1pro_dex
+        is_articulation_part: true
+        articulation_part_name: zed_link
+        create_fixed_joint: true
+        local_position: [0.0, 0.0, 0.0]
+        local_rotation:
+        - 0.33
+        - 1.0
+        - -0.0
+        - 0.0
    front_camera:
      name: front_camera
      stereotype: camera
-      position: [1.0, -4.0, 1.5]
+      position: [2, -4.1, 1.8]
      look_at:
        is_point: true
-        look_at_point: [-1.0, -4.0, 1.2]
+        look_at_point: [0.0, -4.1, 1.2]
      data_types: [rgb]
      width: 1280
      height: 720
@@ -161,7 +172,7 @@ scene:
    left_camera:
      name: left_camera
      stereotype: camera
-      position: [-1.0, -1.0, 1.2]
+      position: [-0.58554, -2.0, 1.8]
      look_at:
        is_point: true
        look_at_point: [0.0, -4.1, 1.2]
@@ -173,7 +184,7 @@ scene:
    right_camera:
      name: right_camera
      stereotype: camera
-      position: [-1.0, -6.5, 1.2]
+      position: [0.36816, -5.36, 1.8]
      look_at:
        is_point: true
        look_at_point: [0.0, -4.1, 1.2]
--- a/starvla_inference_server.py
+++ b/starvla_inference_server.py
@@ -129,7 +129,7 @@ class StarvlaInferenceServer:
    def inference(self, observation: dict) -> dict:
        
        img_head, state_vec, prompt = \
-            self.parse_observation(observation)
+            self.parse_observation(observation, target_size=(410, 224))
        vla_input = {
            # "batch_images": [[img_left, img_right, img_wrist]],
            "image": [img_head],
--- a/starvla_policy.py
+++ b/starvla_policy.py
@@ -2,6 +2,7 @@ import pickle
 import time
 import json
 import numpy as np
+from scipy.spatial.transform import Rotation as R
 import requests

 from fastsim.annotations.config_class import configclass, field
@@ -70,6 +71,11 @@ class StarvlaPolicy(Policy):
        self.visualize_action_ee_pose = config.visualize_action_ee_pose
        self.visualize_state_ee_pose = config.visualize_state_ee_pose
        self.visualize_bounding_box_targets = list(config.visualize_bounding_box_targets or [])
+        # prevent circular import
+        import pandas as pd
+        df_data = pd.read_parquet("/home/zhiyuan/zhujuan/datasets/add_remove_lid_15fps_10epi/data/chunk-000/file-000.parquet")
+        self.dummy_data = np.array(df_data.groupby('episode_index')['observation.state'].apply(list).to_dict()[0])
+        self.dummy_data_idx = 0

    def reset(self) -> None:
        self.current_state = {}
@@ -167,6 +173,35 @@ class StarvlaPolicy(Policy):

    def postprocess_action(self, action: dict) -> BenchmarkAction:
        benchmark_action = BenchmarkAction()
+        read_chunk_size = 1
+        dummy_action = self.dummy_data[self.dummy_data_idx:(self.dummy_data_idx + read_chunk_size)]
+        if self.dummy_data_idx + read_chunk_size >= self.dummy_data.shape[0]:
+            self.dummy_data_idx = 0
+            exit(0)
+        else:
+            self.dummy_data_idx += read_chunk_size
+        read_chunk_id = 0
+        print(f'{self.current_chunk_id=}, {self.dummy_data_idx = }, {read_chunk_id=}')
+        time.sleep(1.0)
+
+        left_rpy_state = dummy_action[:, 3:6]         # (3,)
+        right_rpy_state = dummy_action[:, 31:34]         # (3,)
+
+        left_rot_state = R.from_euler('xyz', left_rpy_state).as_matrix()
+        right_rot_state = R.from_euler('xyz', right_rpy_state).as_matrix()
+
+        left_state_rot6d = np.concatenate([left_rot_state[:, 0], left_rot_state[:, 1]], axis=-1) # (6,)
+        right_state_rot6d = np.concatenate([right_rot_state[:, 0], right_rot_state[:, 1]], axis=-1) # (6,)
+
+        read_state = {"left_arm": {
+                    "ee_position_chunks": dummy_action[:, :3].tolist(),
+                    "ee_rot6d_chunks": left_state_rot6d.tolist(),
+                    "finger_chunks": dummy_action[:, 6:28].tolist()},
+                "right_arm": {
+                    "ee_position_chunks": dummy_action[:, 28:31].tolist(),
+                    "ee_rot6d_chunks": right_state_rot6d.tolist(),
+                    "finger_chunks": dummy_action[:, 34:56].tolist()}
+                }
        for arm_key in self.robot['arms'].keys():
            action_arm = action[arm_key]
            delta_ee_pose = Pose(position=action_arm["ee_delta_position_chunks"][self.current_chunk_id], rot6d=action_arm["ee_delta_rot6d_chunks"][self.current_chunk_id])
@@ -174,19 +209,22 @@ class StarvlaPolicy(Policy):
            curr_action_ee_pose = curr_state_ee_pose * delta_ee_pose # action2base = state2base * action2state
            finger_joint_qpos = action_arm["finger_chunks"][self.current_chunk_id] + self.current_state[arm_key]["finger_qpos"] 
            joint_names = self.left_hand_joints if arm_key == "left_arm" else self.right_hand_joints
+            state_arm = read_state[arm_key]
            benchmark_action.add_robot_action(
                RobotAction(
                    control_mode=ControlMode.POSITION,
                    robot_name=self.robot_name,
                    joint_names=joint_names,
-                    joint_positions=finger_joint_qpos
+                    # joint_positions=finger_joint_qpos
+                    joint_positions=state_arm["finger_chunks"][read_chunk_id]
                )
            )
            benchmark_action.add_robot_action(
                RobotAction(
                    control_mode=ControlMode.EE_POSE,
                    robot_name=self.robot_name,
-                    ee_pose=curr_action_ee_pose,
+                    # ee_pose=curr_action_ee_pose,
+                    ee_pose=Pose(position=state_arm["ee_position_chunks"][read_chunk_id], rot6d=state_arm["ee_rot6d_chunks"][read_chunk_id]),
                    arm_name=arm_key
                )
            )
Author	SHA1	Message	Date
Zhu Juan	e99f6a0589	add debug code for replay	2026-06-15 14:58:28 +08:00
Zhu Juan	cbf0edfcae	update the image size for inference	2026-06-12 14:36:13 +08:00
Zhu Juan	87f2a2abfc	update benchmark view point	2026-05-25 18:12:37 +08:00