
Commit f4e12be

Fix handling batched vector annotation of tuple to np.array (#264)

* Fix handling batched vector annotation from tuple to np.array
* nit and readme

Signed-off-by: youliangt <youliangt@nvidia.com>

1 parent d598400 commit f4e12be

File tree

4 files changed: +21 −13 lines changed

README.md

Lines changed: 8 additions & 9 deletions
````diff
@@ -222,15 +222,6 @@ python scripts/gr00t_finetune.py --dataset-path ./demo_data/robot_sim.PickNPlace
 
 **Note**: If you are finetuning on a 4090, you need to pass the `--no-tune_diffusion_model` flag when running `gr00t_finetune.py` to avoid CUDA out of memory.
 
-You can also download a sample dataset from our huggingface sim data release [here](https://huggingface.co/datasets/nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim)
-
-```
-huggingface-cli download nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim \
-  --repo-type dataset \
-  --include "gr1_arms_only.CanSort/**" \
-  --local-dir $HOME/gr00t_dataset
-```
-
 The recommended finetuning configuration is to boost your batch size to the max, and train for 20k steps.
 
 *Hardware Performance Considerations*
@@ -255,6 +246,14 @@ GR00T N1.5 provides three pretrained embodiment heads optimized for different ro
 
 Select the embodiment head that best matches your robot's configuration for optimal finetuning performance. For detailed information on the observation and action spaces, see [`EmbodimentTag`](getting_started/4_deeper_understanding.md#embodiment-action-head-fine-tuning).
 
+
+### Sim Env: [robocasa-gr1-tabletop-tasks](https://github.com/robocasa/robocasa-gr1-tabletop-tasks)
+
+A sample dataset for finetuning can be downloaded from our huggingface release [here](https://huggingface.co/datasets/nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim)
+
+For simulation evaluation, please refer to [robocasa-gr1-tabletop-tasks](https://github.com/robocasa/robocasa-gr1-tabletop-tasks)
+
+
 ## 4. Evaluation
 
 To conduct an offline evaluation of the model, we provide a script that evaluates the model on a dataset and plots it out. Quick try: `python scripts/eval_policy.py --plot --model_path nvidia/GR00T-N1.5-3B`
````

demo_data/robot_sim.PickNPlace/meta/modality.json

Lines changed: 4 additions & 1 deletion
```diff
@@ -74,6 +74,9 @@
     },
     "annotation": {
         "human.action.task_description": {},
-        "human.validity": {}
+        "human.validity": {},
+        "human.coarse_action": {
+            "original_key": "annotation.human.action.task_description"
+        }
     }
 }
```
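The new `human.coarse_action` entry reuses an existing annotation column via `original_key` instead of requiring a new column in the dataset. A minimal sketch of how such an alias could be resolved when loading the modality metadata; the `resolve_annotation_keys` helper is hypothetical, not part of the repo:

```python
# Excerpt mirroring the updated "annotation" block of modality.json.
modality = {
    "annotation": {
        "human.action.task_description": {},
        "human.validity": {},
        "human.coarse_action": {
            "original_key": "annotation.human.action.task_description"
        },
    }
}


def resolve_annotation_keys(annotation_cfg: dict) -> dict:
    """Map each annotation name to the dataset key it reads from.

    Hypothetical helper: entries with "original_key" alias another
    column; all others read "annotation.<name>" directly.
    """
    return {
        name: cfg.get("original_key", f"annotation.{name}")
        for name, cfg in annotation_cfg.items()
    }


resolved = resolve_annotation_keys(modality["annotation"])
# "human.coarse_action" now points at the task-description column.
print(resolved["human.coarse_action"])
```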

gr00t/model/policy.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -152,12 +152,14 @@ def get_action(self, observations: Dict[str, Any]) -> Dict[str, Any]:
         e.g. obs = {
             "video.<>": np.ndarray,      # (T, H, W, C)
             "state.<>": np.ndarray,      # (T, D)
+            "annotation.<>": np.ndarray, # (T,)
         }
 
         or with batched input:
         e.g. obs = {
             "video.<>": np.ndarray,      # (B, T, H, W, C)
             "state.<>": np.ndarray,      # (B, T, D)
+            "annotation.<>": np.ndarray, # (B, T)
         }
 
     Returns:
@@ -167,6 +169,12 @@ def get_action(self, observations: Dict[str, Any]) -> Dict[str, Any]:
         is_batch = self._check_state_is_batched(observations)
         if not is_batch:
             observations = unsqueeze_dict_values(observations)
+
+        # NOTE(YL): ensure all observation values are numpy arrays
+        for k, v in observations.items():
+            if not isinstance(v, np.ndarray):
+                observations[k] = np.array(v)
+
         # Apply transforms
         normalized_input = self.apply_transforms(observations)
```
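The loop added to `get_action` coerces any non-ndarray observation values, such as a batched annotation that arrives as a nested tuple of strings, into `np.ndarray` before the transforms run. A standalone sketch of that behavior; the observation keys and shapes here are illustrative, not the policy's required schema:

```python
import numpy as np


def coerce_obs_to_numpy(observations: dict) -> dict:
    """Mirror of the added loop: convert every non-ndarray value
    (e.g. a tuple of annotation strings) to a numpy array."""
    for k, v in observations.items():
        if not isinstance(v, np.ndarray):
            observations[k] = np.array(v)
    return observations


# Batched input where the annotation arrived as a tuple, not an array.
obs = {
    "state.left_arm": np.zeros((2, 1, 7)),                      # (B, T, D)
    "annotation.human.coarse_action": (("pick",), ("place",)),  # (B, T)
}
obs = coerce_obs_to_numpy(obs)
print(type(obs["annotation.human.coarse_action"]))   # <class 'numpy.ndarray'>
print(obs["annotation.human.coarse_action"].shape)   # (2, 1)
```

Without this coercion, downstream transforms that index or reshape the annotation would fail on a plain tuple, which is exactly the batched-input case this commit fixes.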

scripts/simulation_service.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -50,9 +50,7 @@
     parser.add_argument(
         "--host", type=str, help="Host address for the server.", default="localhost"
     )
-    parser.add_argument(
-        "--video_dir", type=str, help="Directory to save videos.", default="./videos"
-    )
+    parser.add_argument("--video_dir", type=str, help="Directory to save videos.", default=None)
     parser.add_argument("--n_episodes", type=int, help="Number of episodes to run.", default=2)
     parser.add_argument("--n_envs", type=int, help="Number of parallel environments.", default=1)
     parser.add_argument(
```
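Changing `--video_dir` from `default="./videos"` to `default=None` makes video saving opt-in rather than always-on. A hedged sketch of the resulting caller-side pattern; the guard shown is illustrative, not the script's actual control flow:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--video_dir", type=str, help="Directory to save videos.", default=None)
parser.add_argument("--n_episodes", type=int, help="Number of episodes to run.", default=2)

# No flags passed: video_dir stays None, so no videos are written.
args = parser.parse_args([])
save_videos = args.video_dir is not None
print(save_videos)  # False
```

A `None` default lets the script distinguish "user asked for videos" from "user said nothing", which a hard-coded `"./videos"` default cannot.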
