1313
1414import cv2
1515import habitat
16+ import imageio
1617import numpy as np
1718import quaternion
1819import torch
@@ -105,6 +106,10 @@ def __init__(self, cfg: EvalCfg):
105106
106107 # ------------------------------------- model ------------------------------------------
107108 self .model_args = argparse .Namespace (** cfg .agent .model_settings )
109+ self .vis_debug = bool (getattr (self .model_args , "vis_debug" , False ))
110+ self .vis_debug_path = getattr (
111+ self .model_args , "vis_debug_path" , os .path .join (self .output_path , "vis_debug" )
112+ )
108113
109114 processor = AutoProcessor .from_pretrained (self .model_args .model_path )
110115 processor .tokenizer .padding_side = 'left'
@@ -288,9 +293,17 @@ def _run_eval_dual_system(self) -> tuple:
288293
289294 vis_frames = []
290295 step_id = 0
296+ vis_writer = None
291297
292298 if self .save_video :
293299 os .makedirs (os .path .join (self .output_path , f'vis_{ self .epoch } ' , f'{ scene_id } ' ), exist_ok = True )
300+ if self .vis_debug :
301+ debug_dir = os .path .join (self .vis_debug_path , f'epoch_{ self .epoch } ' )
302+ os .makedirs (debug_dir , exist_ok = True )
303+ vis_writer = imageio .get_writer (
304+ os .path .join (debug_dir , f'{ scene_id } _{ episode_id :04d} .mp4' ),
305+ fps = 5 ,
306+ )
294307
295308 rgb_list = []
296309 action_seq = []
@@ -307,6 +320,7 @@ def _run_eval_dual_system(self) -> tuple:
307320
308321 # ---------- 2. Episode step loop -----------
309322 while (not done ) and (step_id <= self .max_steps_per_episode ):
323+ draw_pixel_goal = False
310324 # refactor agent get action
311325 rgb = observations ["rgb" ]
312326 depth = observations ["depth" ]
@@ -422,6 +436,7 @@ def _run_eval_dual_system(self) -> tuple:
422436 coord = [int (c ) for c in re .findall (r'\d+' , llm_outputs )]
423437
424438 pixel_goal = [int (coord [1 ]), int (coord [0 ])]
439+ draw_pixel_goal = True
425440
426441 # look down --> horizontal
427442 self .env .step (action_code .LOOKUP )
@@ -526,6 +541,24 @@ def _run_eval_dual_system(self) -> tuple:
526541
527542 print ("step_id" , step_id , "action" , action )
528543
544+ if vis_writer is not None :
545+ vis = np .asarray (save_raw_image ).copy ()
546+ vis = cv2 .putText (
547+ vis ,
548+ f"step { step_id } action { int (action )} " ,
549+ (20 , 40 ),
550+ cv2 .FONT_HERSHEY_SIMPLEX ,
551+ 1 ,
552+ (0 , 255 , 0 ),
553+ 2 ,
554+ )
555+ if pixel_goal is not None :
556+ if draw_pixel_goal :
557+ cv2 .circle (
558+ vis , (pixel_goal [0 ], pixel_goal [1 ]), radius = 8 , color = (255 , 0 , 0 ), thickness = - 1
559+ )
560+ vis_writer .append_data (vis )
561+
529562 if action == action_code .LOOKDOWN :
530563 self .env .step (action )
531564 observations , _ , done , _ = self .env .step (action )
@@ -586,6 +619,8 @@ def _run_eval_dual_system(self) -> tuple:
586619 quality = 9 ,
587620 )
588621 vis_frames .clear ()
622+ if vis_writer is not None :
623+ vis_writer .close ()
589624
590625 self .env .close ()
591626
@@ -643,9 +678,17 @@ def _run_eval_system2(self) -> tuple:
643678
644679 vis_frames = []
645680 step_id = 0
681+ vis_writer = None
646682
647683 if self .save_video :
648684 os .makedirs (os .path .join (self .output_path , f'vis_{ self .epoch } ' , f'{ scene_id } ' ), exist_ok = True )
685+ if self .vis_debug :
686+ debug_dir = os .path .join (self .vis_debug_path , f'epoch_{ self .epoch } ' )
687+ os .makedirs (debug_dir , exist_ok = True )
688+ vis_writer = imageio .get_writer (
689+ os .path .join (debug_dir , f'{ scene_id } _{ episode_id :04d} .mp4' ),
690+ fps = 5 ,
691+ )
649692 initial_height = self .env ._env .sim .get_agent_state ().position [1 ]
650693
651694 rgb_list = []
@@ -662,6 +705,7 @@ def _run_eval_system2(self) -> tuple:
662705
663706 # ---------- 2. Episode step loop -----------
664707 while (not done ) and (step_id <= self .max_steps_per_episode ):
708+ draw_pixel_goal = False
665709 # refactor agent get action
666710 rgb = observations ["rgb" ]
667711 depth = observations ["depth" ]
@@ -755,6 +799,7 @@ def _run_eval_system2(self) -> tuple:
755799 coord = [int (c ) for c in re .findall (r'\d+' , llm_outputs )]
756800
757801 pixel_goal = [int (coord [1 ]), int (coord [0 ])]
802+ draw_pixel_goal = True
758803
759804 # look down --> horizontal
760805 self .env .step (action_code .LOOKUP )
@@ -818,6 +863,21 @@ def _run_eval_system2(self) -> tuple:
818863
819864 print ("step_id" , step_id , "action" , action )
820865
866+ if vis_writer is not None :
867+ vis = np .asarray (save_raw_image ).copy ()
868+ vis = cv2 .putText (
869+ vis ,
870+ f"step { step_id } action { int (action )} " ,
871+ (20 , 40 ),
872+ cv2 .FONT_HERSHEY_SIMPLEX ,
873+ 1 ,
874+ (0 , 255 , 0 ),
875+ 2 ,
876+ )
877+ if draw_pixel_goal :
878+ cv2 .circle (vis , (pixel_goal [0 ], pixel_goal [1 ]), radius = 8 , color = (255 , 0 , 0 ), thickness = - 1 )
879+ vis_writer .append_data (vis )
880+
821881 if action == action_code .LOOKDOWN :
822882 self .env .step (action )
823883 observations , _ , done , _ = self .env .step (action )
@@ -875,6 +935,8 @@ def _run_eval_system2(self) -> tuple:
875935 quality = 9 ,
876936 )
877937 vis_frames .clear ()
938+ if vis_writer is not None :
939+ vis_writer .close ()
878940
879941 self .env .close ()
880942
0 commit comments