From 1b30c3f96a5e7bc043870008fd1e086be4b8acc9 Mon Sep 17 00:00:00 2001
From: "hufei.hofee" <hufei.hofee@jd.com>
Date: Thu, 19 Mar 2026 01:43:02 +0800
Subject: [PATCH] finish benchmark debug

---
 .gitignore                  |   3 +-
 Readme.md                   |  28 +++++
 benchmark.yaml              | 234 +++++++++++++++++++++---------------
 starvla_inference_server.py |  86 ++++++-------
 starvla_policy.py           |  95 ++++++++-------
 5 files changed, 257 insertions(+), 189 deletions(-)
 create mode 100644 Readme.md

diff --git a/.gitignore b/.gitignore
index 600d2d3..83c4619 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-.vscode
\ No newline at end of file
+.vscode
+__pycache__/
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..ab0607a
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,28 @@
+# StarVLA Benchmark
+
+## 1 Installation
+
+### 1.1 Install Fastsim
+
+Install Fastsim from: [https://git.hofee.cn/hofee/fastsim.git](https://git.hofee.cn/hofee/fastsim.git)
+
+### 1.2 Install StarVLA
+
+Install edited version of StarVLA from `XiongHao Wu`
+
+## 2 Run Benchmark
+
+### 2.1 Run FastSim
+
+```bash
+conda activate fastsim
+joysim launch_simulation --config ./benchmark.yaml # policy will be auto-scanned.
+```
+
+### 2.2 Run StarVLA inference server
+
+```bash
+conda activate starvla
+python starvla_inference_server.py --config ./benchmark.yaml
+```
+
diff --git a/benchmark.yaml b/benchmark.yaml
index 8affca8..4ab18a7 100644
--- a/benchmark.yaml
+++ b/benchmark.yaml
@@ -2,7 +2,6 @@ general:
   scan_project: true
   root_paths:
     asset: /home/ubuntu/projects/gen_data/data
-    output: /home/ubuntu/output
     checkpoints: /home/ubuntu/data/models
 
 simulation:
@@ -13,160 +12,197 @@ simulation:
     livestream: 0
     
 scene:
-  name: default_scene_name
-  position: [0, 0, 0]
-  rotation: [1, 0, 0, 0]
+  name: kujiale_multispace
   base_config:
-    name: default_base
-    source: primitive
-    stereotype: ground_plane
-    ground_size: [100,100]
-
+    stereotype: usd
+    name: _827313_home_workspace_00
+    source: local
+    asset_path: asset://scenes/kujiale_multispace/827313_home/workspace_00.usd
   object_cfg_dict:
-    table:
-      name: simple_table
-      position: [0.5, 0, 0.25]
-      source: primitive
-      stereotype: rigid
-      primitive_type: cuboid
-      primitive_size: [0.5, 1, 0.5]
-      mass: 1e4
-
-    target:
-      name: target
-      position: [0.4, 0.0, 0.5]
-      scale: [0.001, 0.001, 0.001]
-      axis_y_up: true
-      asset_path: asset://objects/omni6DPose/ball/omni6DPose_ball_020/Aligned.usd
+    omni6DPose_can_016:
+      name: omni6DPose_can_016
       stereotype: rigid
       source: local
-
-
+      asset_path: asset://objects/omni6DPose/can/omni6DPose_can_016/Aligned.usd
+      scale:
+      - 0.001
+      - 0.001
+      - 0.001
+      position:
+      - 0.0
+      - -3.79243
+      - 0.5
+      quaternion:
+      - -0.304408012043137
+      - -0.304408012043137
+      - 0.638228612805745
+      - 0.6382286128057448
+      axis_y_up: true
+    omni6DPose_plug_001:
+      name: omni6DPose_plug_001
+      stereotype: rigid
+      source: local
+      asset_path: asset://objects/omni6DPose/plug/omni6DPose_plug_001/Aligned.usd
+      scale:
+      - 0.001
+      - 0.001
+      - 0.001
+      position:
+      - 0.419859
+      - -4.152430000000001
+      - 0.510259093
+      quaternion:
+      - 0.7070997233636068
+      - 0.707099723363607
+      - 0.003159306745230588
+      - 0.003159306745230588
+      axis_y_up: true
   robot_cfg_dict:
-    robot:
-      name: my_robot
+    Franka:
+      name: Franka
       asset_path: asset://Franka/franka_robotiq_2f85_zedmini.usd
-      position: [0, 0, 0]
+      position:
+      - -0.3779152929316859
+      - -3.943187336951741
+      - 0.3130090550954169
+      rotation:
+      - 0.9987576855319703
+      - 0.0
+      - 0.0
+      - 0.04983056883903615
       stereotype: single_gripper_arm_robot
       source: local
       init_joint_position:
         panda_joint2: -0.1633
-        panda_joint4: -1.070
+        panda_joint4: -1.07
         panda_joint6: 0.8933
         panda_joint7: 0.785
-      
       arm_actuator_name: franka_arm
       gripper_actuator_name: robotiq_2f_85
-
       use_planner: true
       planner_cfg:
         stereotype: curobo
         lazy_init: true
         robot_config_file: asset://curobo/franka_robotiq_2f85/franka_robotiq_2f85.yml
         world_config_source: stage
-        world_stage_ignore_substrings: [my_robot]
-        world_stage_only_paths: [/World]
-        world_stage_reference_prim_path: /World/Robot/SingleGripperArmRobot/my_robot
-
   sensor_cfg_dict:
-    front_camera:
-      name: front_camera
+    Hand_Camera:
+      name: Hand_Camera
       stereotype: camera
-      position: [0.8, 0.0, 0.8]
-      data_types: [rgb, depth, normals]
+      data_types:
+      - rgb
+      - depth
+      - normals
       width: 1280
       height: 720
       camera_model: pinhole
       fix_camera: true
-
-    left_camera:
-      name: left_camera
+      focal_length: 2.8
+      horizontal_aperture: 4.890881131191918
+      vertical_aperture: 2.7608816125932627
+      convention: opengl
+      attach_to:
+        target_name: Franka
+        is_articulation_part: true
+        articulation_part_name: panda_link8
+        create_fixed_joint: true
+        local_position:
+        - -0.07176474936469446
+        - 0.02890958100382394
+        - 0.01978286477078585
+        local_rotation:
+        - 0.12352531576657892
+        - 0.7000519980574638
+        - -0.6933396337721066
+        - -0.11810524383481495
+    Left_Camera:
+      name: Left_Camera
       stereotype: camera
-      position: [0.6, 0.7, 0.8]
-      data_types: [rgb, depth, normals]
+      data_types:
+      - rgb
+      - depth
+      - normals
       width: 1280
       height: 720
       camera_model: pinhole
-      fix_camera: true
-
-    right_camera:
-      name: right_camera
+      fix_camera: false
+      focal_length: 2.1
+      horizontal_aperture: 5.019302546405283
+      vertical_aperture: 2.833796298140747
+      convention: opengl
+      attach_to:
+        target_name: Franka
+        local_position:
+        - -0.07383269512283744
+        - -0.4566116797983716
+        - 0.5664518136443555
+        local_rotation:
+        - 0.7669289562134721
+        - 0.4437472984144337
+        - -0.232167881174614
+        - -0.4012560108236337
+    Right_Camera:
+      name: Right_Camera
       stereotype: camera
-      position: [0.6, -0.7, 0.8]
-      data_types: [rgb, depth, normals]
+      data_types:
+      - rgb
+      - depth
+      - normals
       width: 1280
       height: 720
       camera_model: pinhole
-      fix_camera: true
+      fix_camera: false
+      focal_length: 2.1
+      horizontal_aperture: 5.053278483887542
+      vertical_aperture: 2.833796298140747
+      convention: opengl
+      attach_to:
+        target_name: Franka
+        local_position:
+        - 0.4501524009531093
+        - 0.7206899873248545
+        - 0.27525652672526973
+        local_rotation:
+        - 0.007056820167439314
+        - 0.007107689842383092
+        - 0.709606075042274
+        - 0.7045274304789896
 
 extension:
   extension_cfg_dict:
-    my_data_collect:
+    benchmark_data_collect:
       enable: true
       stereotype: data_collect
       observer_cfgs:
         - stereotype: robot_observer
-          name: my_robot
-          observe_joint_positions: true
-          observe_joint_velocities: true
-          observe_joint_accelerations: true
-          observe_joint_position_targets: true
-          observe_joint_velocity_targets: true
-          observe_position: true
-          observe_rotation: true
+          name: Franka
           observe_ee_pose: true
           observe_gripper_state: true
           observe_gripper_drive_state: true
         - stereotype: sensor_observer
-          name: front_camera
-          observe_intrinsic_matrix: true
-          observe_extrinsic_matrix: true
+          name: Hand_Camera
           observe_rgb: true
-          observe_depth: true
-          observe_normals: true
         - stereotype: sensor_observer
-          name: left_camera
-          observe_intrinsic_matrix: true
-          observe_extrinsic_matrix: true
+          name: Left_Camera
           observe_rgb: true
-          observe_depth: true
-          observe_normals: true
         - stereotype: sensor_observer
-          name: right_camera
-          observe_intrinsic_matrix: true
-          observe_extrinsic_matrix: true
+          name: Right_Camera
           observe_rgb: true
-          observe_depth: true
-          observe_normals: true
-
-        - stereotype: task_observer
-          name: task
-
-        - stereotype: object_observer
-          name: target
-          observe_position: true
-          observe_rotation: true
-          observe_scale: true
           
-    my_benchmark:
+    starvla_benchmark:
       enable: true
       stereotype: benchmark
-      data_collector_name: my_data_collect
+      data_collector_name: benchmark_data_collect
       goals:
-        - name: reach_target
-          description: Reach the target
-          stereotype: pose
-          pose_A_source: ee
-          pose_A_params: 
-            robot_name: my_robot
-          pose_B_source: spawnable
-          pose_B_params:
-            spawnable_name: target
-          position_tolerance: 0.005
+        - name: cola on top of book
+          description: check if the cola bottle is on the book
+          stereotype: on_top
+          object_A_name: omni6DPose_plug_001
+          object_B_name: omni6DPose_can_016
       policy:
         stereotype: starvla
-        robot_name: my_robot
-        object_name: target
+        robot_name: Franka
+        sensor_names: [Hand_Camera, Left_Camera, Right_Camera]
         prompt: pick the cola bottle and place it on the book
 
 policy_server:
diff --git a/starvla_inference_server.py b/starvla_inference_server.py
index d489bfa..f4f674b 100644
--- a/starvla_inference_server.py
+++ b/starvla_inference_server.py
@@ -1,7 +1,9 @@
 
 import yaml
 import pickle
+import argparse
 import os
+import traceback
 from urllib.parse import urlparse
 import numpy as np
 import torch
@@ -77,49 +79,17 @@ class StarvlaInferenceServer:
 
         return model
 
-    def parse_observation(self, obs):
+    def parse_observation(self, obs, target_size=(320, 180)):
 
-        rgb = obs["rgb"][-1]
-        state = obs["state"][-1]
-        joint = obs.get("joint", None)
-        prompt = obs["prompt"]
+        left_rgb, right_rgb, wrist_rgb = obs["rgb"]["Left_Camera"], obs["rgb"]["Right_Camera"], obs["rgb"]["Hand_Camera"]
 
-        left = rgb[:, :, :3]
-        right = rgb[:, :, 3:6]
-        wrist = rgb[:, :, 6:9]
+        img_left = Image.fromarray(cv2.resize(left_rgb, target_size))
+        img_right = Image.fromarray(cv2.resize(right_rgb, target_size))
+        img_wrist = Image.fromarray(cv2.resize(wrist_rgb, target_size))
 
-        target_size = (320, 180)
+        state_vec = obs["state"]
 
-        left = cv2.resize(left, target_size)
-        right = cv2.resize(right, target_size)
-        wrist = cv2.resize(wrist, target_size)
-
-        img_left = Image.fromarray(left)
-        img_right = Image.fromarray(right)
-        img_wrist = Image.fromarray(wrist)
-
-        if self.state_mode == "joint8":
-
-            joint_last = joint[-1]
-            gripper = state[9]
-
-            state_vec = np.concatenate(
-                [joint_last, np.array([gripper])],
-                axis=0
-            )
-
-        else:
-
-            xyz = state[0:3]
-            rot6d = state[3:9]
-            gripper = state[9]
-
-            state_vec = np.concatenate(
-                [xyz, rot6d[:3], np.array([gripper])],
-                axis=0
-            )
-
-        return img_left, img_right, img_wrist, state_vec, prompt
+        return img_left, img_right, img_wrist, state_vec, obs["prompt"]
 
     def inference(self, observation: dict) -> dict:
 
@@ -141,18 +111,31 @@ class StarvlaInferenceServer:
             actions = actions.cpu().numpy()
 
         if actions.ndim == 3:
-            actions = actions[0]
-
-        return {"action": actions.astype(np.float32)}
+            actions = actions[0] # (8, 7)
+        return {"ee_delta_position_chunks": actions[:, :3].tolist(), 
+                "ee_delta_euler_xyz_chunks": actions[:, 3:6].tolist(),
+                "gripper_chunks": actions[:, 6:7].tolist()}
 
     def register_routes(self):
 
         @self.app.route("/policy", methods=["POST"])
         def policy():
-            data = pickle.loads(request.data)
-            result = self.inference(data)
-            body = pickle.dumps(result, protocol=4)
-            return Response(body, mimetype="application/octet-stream")
+            try:
+                data = pickle.loads(request.data)
+                result = self.inference(data)
+                body = pickle.dumps(result, protocol=4)
+                return Response(body, mimetype="application/octet-stream")
+            except Exception as e:
+                err_obj = {
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                }
+                body = pickle.dumps(err_obj, protocol=4)
+                return Response(
+                    body,
+                    mimetype="application/octet-stream",
+                    status=500,
+                )
 
     def run(self):
 
@@ -168,6 +151,15 @@ class StarvlaInferenceServer:
 
 
 if __name__ == "__main__":
-    config_path = "./benchmark.yaml"
+    parser = argparse.ArgumentParser(description="StarVLA inference server")
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="./benchmark.yaml",
+        help="Path to benchmark.yaml",
+    )
+    args = parser.parse_args()
+
+    config_path = args.config
     server = StarvlaInferenceServer(config_path)
     server.run()
\ No newline at end of file
diff --git a/starvla_policy.py b/starvla_policy.py
index b6010a6..a0eafc9 100644
--- a/starvla_policy.py
+++ b/starvla_policy.py
@@ -1,6 +1,9 @@
+import pickle
+
 from joysim.annotations.config_class import configclass, field
 from joysim.annotations.stereotype import stereotype
 from joysim.app import JoySim
+from joysim.controllers.motion_plan_controller import MotionPlanController
 from joysim.core.scene_manager import SceneManager
 from joysim.extensions.benchmark.action import RobotAction
 from joysim.extensions.benchmark.benchmark import (
@@ -9,19 +12,21 @@ from joysim.extensions.benchmark.benchmark import (
     ControlMode,
 )
 from joysim.extensions.benchmark.policy import Policy, PolicyConfig
-
+from joysim.utils.log import Log
+from joysim.utils.pose import Pose
 import numpy as np
-import pickle
 import requests
 
-
 @configclass
 @stereotype.register_config("starvla")
 class StarvlaPolicyConfig(PolicyConfig):
 
     robot_name: str = field(default="my_robot", required=True, comment="The name of the robot")
-    object_name: str = field(default="target", required=True, comment="The name of the object")
-
+    sensor_names: list[str] = field(
+        default=["Hand_Camera", "Left_Camera", "Right_Camera"],
+        required=True,
+        comment="The names of the sensors"
+    )
     server_url: str = field(
         default="http://127.0.0.1:5000/policy",
         required=True,
@@ -34,7 +39,7 @@ class StarvlaPolicyConfig(PolicyConfig):
         comment="task instruction"
     )
 
-
+    
 @stereotype.register_model("starvla")
 class StarvlaPolicy(Policy):
 
@@ -42,70 +47,76 @@ class StarvlaPolicy(Policy):
         super().__init__(config)
 
         self.robot_name = config.robot_name
-        self.object_name = config.object_name
+        self.sensor_names = config.sensor_names
         self.server_url = config.server_url
         self.prompt = config.prompt
 
     def reset(self) -> None:
-        pass
+        self.current_ee_position_state = None
+        self.current_ee_euler_xyz_state = None
+        self.current_gripper_state = None
 
     def warmup(self, benchmark_observation: BenchmarkObservation) -> None:
         pass
-
     def needs_observation(self) -> bool:
         return True
 
+    def _handle_server_error(self, response: requests.Response) -> None:
+        if response.status_code == 500:
+            err_obj = pickle.loads(response.content)
+            Log.error(f"StarVLA server error: {err_obj['error']}")
+            Log.error(f"Traceback: {err_obj['traceback']}")
+            exit(0)
+        elif response.status_code != 200:
+            Log.error(f"StarVLA server error with status code <{response.status_code}> : {response.text}")
+            exit(0)
+
     def preprocess_observation(self, benchmark_observation: BenchmarkObservation) -> dict:
-
         robot_obs = benchmark_observation.get_robot_observations(self.robot_name)["robot_data"]
-        joint_positions = robot_obs["joint_positions"]
-        robot_position = robot_obs["position"]
-        robot_quaternion = robot_obs["rotation"]
-
-        state = np.concatenate([
-            robot_position,
-            robot_quaternion,
-            np.array([0.0])
-        ])
-
-        camera_obs = benchmark_observation.get_sensor_observations()
-        rgb = camera_obs["rgb"]
-
-        obs = {
-            "state": np.expand_dims(state, axis=0),
-            "joint": np.expand_dims(joint_positions, axis=0),
-            "rgb": np.expand_dims(rgb, axis=0),
-            "prompt": self.prompt
-        }
+        ee_pose_base = robot_obs["ee_pose_base"]
+        ee_position, ee_euler_xyz = ee_pose_base["position"],ee_pose_base["euler_xyz"]
+        gripper = 1.0 if robot_obs["gripper_state"]["opened"] else 0.0
+        state = np.concatenate([ee_position,ee_euler_xyz,np.array([gripper])])
+        self.current_ee_position_state = np.array(ee_position).astype(np.float64)
+        self.current_ee_euler_xyz_state = np.array(ee_euler_xyz).astype(np.float64)
+        self.current_gripper_state = np.array([gripper])
+        rgb_data = {}
+        for sensor_name in self.sensor_names:
+            sensor_obs = benchmark_observation.get_sensor_observations(sensor_name)
+            rgb_data[sensor_name] = sensor_obs["rgb"].data.cpu().numpy().astype(np.uint8)
+        obs = {"state": state,"rgb": rgb_data,"prompt": self.prompt}
 
         return obs
 
     def compute_action(self, observation: dict) -> dict:
-
         payload = pickle.dumps(observation)
-
         response = requests.post(
             self.server_url,
             data=payload,
             headers={"Content-Type": "application/octet-stream"}
         )
-
-        if response.status_code != 200:
-            raise RuntimeError(f"StarVLA server error: {response.text}")
-
+        self._handle_server_error(response)
         result = pickle.loads(response.content)
-
         return result
 
     def postprocess_action(self, action: dict) -> BenchmarkAction:
-
         benchmark_action = BenchmarkAction()
 
-        robot = SceneManager.get_robot(self.robot_name)
-        joint_names = robot.get_planner().get_plannable_joint_names()
-
-        joint_positions = action["action"][0]
+        # get base frame end-effector pose # TODO: Make sure add or multiply the current state
+        ee_position = action["ee_delta_position_chunks"][0] + self.current_ee_position_state
+        ee_euler_xyz = action["ee_delta_euler_xyz_chunks"][0] + self.current_ee_euler_xyz_state
 
+        ee_pose = Pose(position=ee_position, euler_xyz=ee_euler_xyz)
+        ik_result = MotionPlanController.solve_ik(
+            robot_name=self.robot_name,
+            base_frame_ee_pose=ee_pose,
+        ).unwrap()
+        if not ik_result["success"]:
+            Log.error(f"IK failed: {ik_result['status']}. Ignore this action.")
+            return benchmark_action
+        
+        joint_names = ik_result["result"]["plannable_joint_names"]
+        joint_positions = ik_result["result"]["plannable_joint_positions"][0]
         benchmark_action.add_robot_action(
             RobotAction(
                 control_mode=ControlMode.POSITION,
@@ -119,5 +130,5 @@ class StarvlaPolicy(Policy):
 
 
 if __name__ == "__main__":
-    js = JoySim("./benchmark.yaml")
+    js = JoySim("/home/ubuntu/projects/benchmark/benchmark.yaml")
     js.start()
\ No newline at end of file