From 1b30c3f96a5e7bc043870008fd1e086be4b8acc9 Mon Sep 17 00:00:00 2001 From: "hufei.hofee" Date: Thu, 19 Mar 2026 01:43:02 +0800 Subject: [PATCH] finish benchmark debug --- .gitignore | 3 +- Readme.md | 28 +++++ benchmark.yaml | 234 +++++++++++++++++++++--------------- starvla_inference_server.py | 86 ++++++------- starvla_policy.py | 95 ++++++++------- 5 files changed, 257 insertions(+), 189 deletions(-) create mode 100644 Readme.md diff --git a/.gitignore b/.gitignore index 600d2d3..83c4619 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.vscode \ No newline at end of file +.vscode +__pycache__/ \ No newline at end of file diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..ab0607a --- /dev/null +++ b/Readme.md @@ -0,0 +1,28 @@ +# StarVLA Benchmark + +## 1 Installation + +### 1.1 Install Fastsim + +Install Fastsim from: [https://git.hofee.cn/hofee/fastsim.git](https://git.hofee.cn/hofee/fastsim.git) + +### 1.2 Install StarVLA + +Install edited version of StarVLA from `XiongHao Wu` + +## 2 Run Benchmark + +### 2.1 Run FastSim + +```bash +conda activate fastsim +joysim launch_simulation --config ./benchmark.yaml # policy will be auto-scanned. +``` + +### 2.2 Run StarVLA inference server + +```bash +conda activate starvla +python starvla_inference_server.py --config ./benchmark.yaml +``` + diff --git a/benchmark.yaml b/benchmark.yaml index 8affca8..4ab18a7 100644 --- a/benchmark.yaml +++ b/benchmark.yaml @@ -2,7 +2,6 @@ general: scan_project: true root_paths: asset: /home/ubuntu/projects/gen_data/data - output: /home/ubuntu/output checkpoints: /home/ubuntu/data/models simulation: @@ -13,160 +12,197 @@ simulation: livestream: 0 scene: - name: default_scene_name - position: [0, 0, 0] - rotation: [1, 0, 0, 0] + name: kujiale_multispace base_config: - name: default_base - source: primitive - stereotype: ground_plane - ground_size: [100,100] - + stereotype: usd + name: _827313_home_workspace_00 + source: local + asset_path: asset://scenes/kujiale_multispace/827313_home/workspace_00.usd object_cfg_dict: - table: - name: simple_table - position: [0.5, 0, 0.25] - source: primitive - stereotype: rigid - primitive_type: cuboid - primitive_size: [0.5, 1, 0.5] - mass: 1e4 - - target: - name: target - position: [0.4, 0.0, 0.5] - scale: [0.001, 0.001, 0.001] - axis_y_up: true - asset_path: asset://objects/omni6DPose/ball/omni6DPose_ball_020/Aligned.usd + omni6DPose_can_016: + name: omni6DPose_can_016 stereotype: rigid source: local - - + asset_path: asset://objects/omni6DPose/can/omni6DPose_can_016/Aligned.usd + scale: + - 0.001 + - 0.001 + - 0.001 + position: + - 0.0 + - -3.79243 + - 0.5 + quaternion: + - -0.304408012043137 + - -0.304408012043137 + - 0.638228612805745 + - 0.6382286128057448 + axis_y_up: true + omni6DPose_plug_001: + name: omni6DPose_plug_001 + stereotype: rigid + source: local + asset_path: asset://objects/omni6DPose/plug/omni6DPose_plug_001/Aligned.usd + scale: + - 0.001 + - 0.001 + - 0.001 + position: + - 0.419859 + - -4.152430000000001 + - 0.510259093 + quaternion: + - 0.7070997233636068 + - 0.707099723363607 + - 0.003159306745230588 + - 0.003159306745230588 + axis_y_up: true robot_cfg_dict: - robot: - name: my_robot + Franka: + name: Franka asset_path: asset://Franka/franka_robotiq_2f85_zedmini.usd - position: [0, 0, 0] + position: + - -0.3779152929316859 + - -3.943187336951741 + - 0.3130090550954169 + rotation: + - 0.9987576855319703 + - 0.0 + - 0.0 + - 0.04983056883903615 stereotype: single_gripper_arm_robot source: local init_joint_position: panda_joint2: -0.1633 - panda_joint4: -1.070 + panda_joint4: -1.07 panda_joint6: 0.8933 panda_joint7: 0.785 - arm_actuator_name: franka_arm gripper_actuator_name: robotiq_2f_85 - use_planner: true planner_cfg: stereotype: curobo lazy_init: true robot_config_file: asset://curobo/franka_robotiq_2f85/franka_robotiq_2f85.yml world_config_source: stage - world_stage_ignore_substrings: [my_robot] - world_stage_only_paths: [/World] - world_stage_reference_prim_path: /World/Robot/SingleGripperArmRobot/my_robot - sensor_cfg_dict: - front_camera: - name: front_camera + Hand_Camera: + name: Hand_Camera stereotype: camera - position: [0.8, 0.0, 0.8] - data_types: [rgb, depth, normals] + data_types: + - rgb + - depth + - normals width: 1280 height: 720 camera_model: pinhole fix_camera: true - - left_camera: - name: left_camera + focal_length: 2.8 + horizontal_aperture: 4.890881131191918 + vertical_aperture: 2.7608816125932627 + convention: opengl + attach_to: + target_name: Franka + is_articulation_part: true + articulation_part_name: panda_link8 + create_fixed_joint: true + local_position: + - -0.07176474936469446 + - 0.02890958100382394 + - 0.01978286477078585 + local_rotation: + - 0.12352531576657892 + - 0.7000519980574638 + - -0.6933396337721066 + - -0.11810524383481495 + Left_Camera: + name: Left_Camera stereotype: camera - position: [0.6, 0.7, 0.8] - data_types: [rgb, depth, normals] + data_types: + - rgb + - depth + - normals width: 1280 height: 720 camera_model: pinhole - fix_camera: true - - right_camera: - name: right_camera + fix_camera: false + focal_length: 2.1 + horizontal_aperture: 5.019302546405283 + vertical_aperture: 2.833796298140747 + convention: opengl + attach_to: + target_name: Franka + local_position: + - -0.07383269512283744 + - -0.4566116797983716 + - 0.5664518136443555 + local_rotation: + - 0.7669289562134721 + - 0.4437472984144337 + - -0.232167881174614 + - -0.4012560108236337 + Right_Camera: + name: Right_Camera stereotype: camera - position: [0.6, -0.7, 0.8] - data_types: [rgb, depth, normals] + data_types: + - rgb + - depth + - normals width: 1280 height: 720 camera_model: pinhole - fix_camera: true + fix_camera: false + focal_length: 2.1 + horizontal_aperture: 5.053278483887542 + vertical_aperture: 2.833796298140747 + convention: opengl + attach_to: + target_name: Franka + local_position: + - 0.4501524009531093 + - 0.7206899873248545 + - 0.27525652672526973 + local_rotation: + - 0.007056820167439314 + - 0.007107689842383092 + - 0.709606075042274 + - 0.7045274304789896 extension: extension_cfg_dict: - my_data_collect: + benchmark_data_collect: enable: true stereotype: data_collect observer_cfgs: - stereotype: robot_observer - name: my_robot - observe_joint_positions: true - observe_joint_velocities: true - observe_joint_accelerations: true - observe_joint_position_targets: true - observe_joint_velocity_targets: true - observe_position: true - observe_rotation: true + name: Franka observe_ee_pose: true observe_gripper_state: true observe_gripper_drive_state: true - stereotype: sensor_observer - name: front_camera - observe_intrinsic_matrix: true - observe_extrinsic_matrix: true + name: Hand_Camera observe_rgb: true - observe_depth: true - observe_normals: true - stereotype: sensor_observer - name: left_camera - observe_intrinsic_matrix: true - observe_extrinsic_matrix: true + name: Left_Camera observe_rgb: true - observe_depth: true - observe_normals: true - stereotype: sensor_observer - name: right_camera - observe_intrinsic_matrix: true - observe_extrinsic_matrix: true + name: Right_Camera observe_rgb: true - observe_depth: true - observe_normals: true - - - stereotype: task_observer - name: task - - - stereotype: object_observer - name: target - observe_position: true - observe_rotation: true - observe_scale: true - my_benchmark: + starvla_benchmark: enable: true stereotype: benchmark - data_collector_name: my_data_collect + data_collector_name: benchmark_data_collect goals: - - name: reach_target - description: Reach the target - stereotype: pose - pose_A_source: ee - pose_A_params: - robot_name: my_robot - pose_B_source: spawnable - pose_B_params: - spawnable_name: target - position_tolerance: 0.005 + - name: cola on top of book + description: check if the cola bottle is on the book + stereotype: on_top + object_A_name: omni6DPose_plug_001 + object_B_name: omni6DPose_can_016 policy: stereotype: starvla - robot_name: my_robot - object_name: target + robot_name: Franka + sensor_names: [Hand_Camera, Left_Camera, Right_Camera] prompt: pick the cola bottle and place it on the book policy_server: diff --git a/starvla_inference_server.py b/starvla_inference_server.py index d489bfa..f4f674b 100644 --- a/starvla_inference_server.py +++ b/starvla_inference_server.py @@ -1,7 +1,9 @@ import yaml import pickle +import argparse import os +import traceback from urllib.parse import urlparse import numpy as np import torch @@ -77,49 +79,17 @@ class StarvlaInferenceServer: return model - def parse_observation(self, obs): + def parse_observation(self, obs, target_size=(320, 180)): - rgb = obs["rgb"][-1] - state = obs["state"][-1] - joint = obs.get("joint", None) - prompt = obs["prompt"] + left_rgb, right_rgb, wrist_rgb = obs["rgb"]["Left_Camera"], obs["rgb"]["Right_Camera"], obs["rgb"]["Hand_Camera"] - left = rgb[:, :, :3] - right = rgb[:, :, 3:6] - wrist = rgb[:, :, 6:9] + img_left = Image.fromarray(cv2.resize(left_rgb, target_size)) + img_right = Image.fromarray(cv2.resize(right_rgb, target_size)) + img_wrist = Image.fromarray(cv2.resize(wrist_rgb, target_size)) - target_size = (320, 180) + state_vec = obs["state"] - left = cv2.resize(left, target_size) - right = cv2.resize(right, target_size) - wrist = cv2.resize(wrist, target_size) - - img_left = Image.fromarray(left) - img_right = Image.fromarray(right) - img_wrist = Image.fromarray(wrist) - - if self.state_mode == "joint8": - - joint_last = joint[-1] - gripper = state[9] - - state_vec = np.concatenate( - [joint_last, np.array([gripper])], - axis=0 - ) - - else: - - xyz = state[0:3] - rot6d = state[3:9] - gripper = state[9] - - state_vec = np.concatenate( - [xyz, rot6d[:3], np.array([gripper])], - axis=0 - ) - - return img_left, img_right, img_wrist, state_vec, prompt + return img_left, img_right, img_wrist, state_vec, obs["prompt"] def inference(self, observation: dict) -> dict: @@ -141,18 +111,31 @@ class StarvlaInferenceServer: actions = actions.cpu().numpy() if actions.ndim == 3: - actions = actions[0] - - return {"action": actions.astype(np.float32)} + actions = actions[0] # (8, 7) + return {"ee_delta_position_chunks": actions[:, :3].tolist(), + "ee_delta_euler_xyz_chunks": actions[:, 3:6].tolist(), + "gripper_chunks": actions[:, 6:7].tolist()} def register_routes(self): @self.app.route("/policy", methods=["POST"]) def policy(): - data = pickle.loads(request.data) - result = self.inference(data) - body = pickle.dumps(result, protocol=4) - return Response(body, mimetype="application/octet-stream") + try: + data = pickle.loads(request.data) + result = self.inference(data) + body = pickle.dumps(result, protocol=4) + return Response(body, mimetype="application/octet-stream") + except Exception as e: + err_obj = { + "error": str(e), + "traceback": traceback.format_exc(), + } + body = pickle.dumps(err_obj, protocol=4) + return Response( + body, + mimetype="application/octet-stream", + status=500, + ) def run(self): @@ -168,6 +151,15 @@ class StarvlaInferenceServer: if __name__ == "__main__": - config_path = "./benchmark.yaml" + parser = argparse.ArgumentParser(description="StarVLA inference server") + parser.add_argument( + "--config", + type=str, + default="./benchmark.yaml", + help="Path to benchmark.yaml", + ) + args = parser.parse_args() + + config_path = args.config server = StarvlaInferenceServer(config_path) server.run() \ No newline at end of file diff --git a/starvla_policy.py b/starvla_policy.py index b6010a6..a0eafc9 100644 --- a/starvla_policy.py +++ b/starvla_policy.py @@ -1,6 +1,9 @@ +import pickle + from joysim.annotations.config_class import configclass, field from joysim.annotations.stereotype import stereotype from joysim.app import JoySim +from joysim.controllers.motion_plan_controller import MotionPlanController from joysim.core.scene_manager import SceneManager from joysim.extensions.benchmark.action import RobotAction from joysim.extensions.benchmark.benchmark import ( @@ -9,19 +12,21 @@ from joysim.extensions.benchmark.benchmark import ( ControlMode, ) from joysim.extensions.benchmark.policy import Policy, PolicyConfig - +from joysim.utils.log import Log +from joysim.utils.pose import Pose import numpy as np -import pickle import requests - @configclass @stereotype.register_config("starvla") class StarvlaPolicyConfig(PolicyConfig): robot_name: str = field(default="my_robot", required=True, comment="The name of the robot") - object_name: str = field(default="target", required=True, comment="The name of the object") - + sensor_names: list[str] = field( + default=["Hand_Camera", "Left_Camera", "Right_Camera"], + required=True, + comment="The names of the sensors" + ) server_url: str = field( default="http://127.0.0.1:5000/policy", required=True, @@ -34,7 +39,7 @@ class StarvlaPolicyConfig(PolicyConfig): comment="task instruction" ) - + @stereotype.register_model("starvla") class StarvlaPolicy(Policy): @@ -42,70 +47,76 @@ class StarvlaPolicy(Policy): super().__init__(config) self.robot_name = config.robot_name - self.object_name = config.object_name + self.sensor_names = config.sensor_names self.server_url = config.server_url self.prompt = config.prompt def reset(self) -> None: - pass + self.current_ee_position_state = None + self.current_ee_euler_xyz_state = None + self.current_gripper_state = None def warmup(self, benchmark_observation: BenchmarkObservation) -> None: pass - def needs_observation(self) -> bool: return True + def _handle_server_error(self, response: requests.Response) -> None: + if response.status_code == 500: + err_obj = pickle.loads(response.content) + Log.error(f"StarVLA server error: {err_obj['error']}") + Log.error(f"Traceback: {err_obj['traceback']}") + exit(0) + elif response.status_code != 200: + Log.error(f"StarVLA server error with status code <{response.status_code}> : {response.text}") + exit(0) + def preprocess_observation(self, benchmark_observation: BenchmarkObservation) -> dict: - robot_obs = benchmark_observation.get_robot_observations(self.robot_name)["robot_data"] - joint_positions = robot_obs["joint_positions"] - robot_position = robot_obs["position"] - robot_quaternion = robot_obs["rotation"] - - state = np.concatenate([ - robot_position, - robot_quaternion, - np.array([0.0]) - ]) - - camera_obs = benchmark_observation.get_sensor_observations() - rgb = camera_obs["rgb"] - - obs = { - "state": np.expand_dims(state, axis=0), - "joint": np.expand_dims(joint_positions, axis=0), - "rgb": np.expand_dims(rgb, axis=0), - "prompt": self.prompt - } + ee_pose_base = robot_obs["ee_pose_base"] + ee_position, ee_euler_xyz = ee_pose_base["position"],ee_pose_base["euler_xyz"] + gripper = 1.0 if robot_obs["gripper_state"]["opened"] else 0.0 + state = np.concatenate([ee_position,ee_euler_xyz,np.array([gripper])]) + self.current_ee_position_state = np.array(ee_position).astype(np.float64) + self.current_ee_euler_xyz_state = np.array(ee_euler_xyz).astype(np.float64) + self.current_gripper_state = np.array([gripper]) + rgb_data = {} + for sensor_name in self.sensor_names: + sensor_obs = benchmark_observation.get_sensor_observations(sensor_name) + rgb_data[sensor_name] = sensor_obs["rgb"].data.cpu().numpy().astype(np.uint8) + obs = {"state": state,"rgb": rgb_data,"prompt": self.prompt} return obs def compute_action(self, observation: dict) -> dict: - payload = pickle.dumps(observation) - response = requests.post( self.server_url, data=payload, headers={"Content-Type": "application/octet-stream"} ) - - if response.status_code != 200: - raise RuntimeError(f"StarVLA server error: {response.text}") - + self._handle_server_error(response) result = pickle.loads(response.content) - return result def postprocess_action(self, action: dict) -> BenchmarkAction: - benchmark_action = BenchmarkAction() - robot = SceneManager.get_robot(self.robot_name) - joint_names = robot.get_planner().get_plannable_joint_names() - - joint_positions = action["action"][0] + # get base frame end-effector pose # TODO: Make sure add or multiply the current state + ee_position = action["ee_delta_position_chunks"][0] + self.current_ee_position_state + ee_euler_xyz = action["ee_delta_euler_xyz_chunks"][0] + self.current_ee_euler_xyz_state + ee_pose = Pose(position=ee_position, euler_xyz=ee_euler_xyz) + ik_result = MotionPlanController.solve_ik( + robot_name=self.robot_name, + base_frame_ee_pose=ee_pose, + ).unwrap() + if not ik_result["success"]: + Log.error(f"IK failed: {ik_result['status']}. Ignore this action.") + return benchmark_action + + joint_names = ik_result["result"]["plannable_joint_names"] + joint_positions = ik_result["result"]["plannable_joint_positions"][0] benchmark_action.add_robot_action( RobotAction( control_mode=ControlMode.POSITION, @@ -119,5 +130,5 @@ class StarvlaPolicy(Policy): if __name__ == "__main__": - js = JoySim("./benchmark.yaml") + js = JoySim("/home/ubuntu/projects/benchmark/benchmark.yaml") js.start() \ No newline at end of file