(TaskRunner pid=2165352) train_sampling_params: {'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'logprobs': 1}
(TaskRunner pid=2165352) val_sampling_params: {'temperature': 0, 'top_k': -1, 'top_p': 1.0, 'logprobs': 1}
(TaskRunner pid=2165352) Checkpoint tracker file does not exist: /home/rllm/checkpoints/deepscaler-agent/swe-agent-rl/latest_checkpointed_iteration.txt
(TaskRunner pid=2165352) Training from scratch
(TaskRunner pid=2165352) Time taken to validate agent: 3.9577484130859375e-05
(TaskRunner pid=2165352) 'epoch 0, step 1 started'
Error executing job with overrides: ['algorithm.adv_estimator=rloo', 'data.train_files=/home/rllm/data/swe/R2E_Gym_Subset.parquet', 'data.val_files=/home/rllm/data/swe/SWE_Bench_Verified.parquet', 'data.train_batch_size=2', 'data.val_batch_size=128', 'data.max_prompt_length=4096', 'data.max_response_length=32768', 'data.filter_overlong_prompts=True', 'data.filter_overlong_prompts_workers=32', 'actor_rollout_ref.model.path=Qwen/Qwen3-32B', 'actor_rollout_ref.hybrid_engine=True', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.model.use_remove_padding=True', 'actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum', 'actor_rollout_ref.actor.ppo_mini_batch_size=2', 'actor_rollout_ref.actor.use_dynamic_bsz=False', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1', 'actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1', 'actor_rollout_ref.actor.ppo_max_token_len_per_gpu=16000', 'actor_rollout_ref.actor.use_kl_loss=False', 'actor_rollout_ref.actor.clip_ratio_high=0.28', 'actor_rollout_ref.actor.kl_loss_coef=0.001', 'actor_rollout_ref.actor.kl_loss_type=low_var_kl', 'actor_rollout_ref.actor.ulysses_sequence_parallel_size=1', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', 'actor_rollout_ref.actor.fsdp_config.param_offload=True', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=True', 'actor_rollout_ref.rollout.tensor_model_parallel_size=4', 'actor_rollout_ref.rollout.name=vllm', 'actor_rollout_ref.rollout.mode=async', 'actor_rollout_ref.rollout.enforce_eager=False', 'actor_rollout_ref.rollout.temperature=1.0', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'actor_rollout_ref.rollout.max_model_len=30000', 'actor_rollout_ref.rollout.max_num_seqs=64', 'actor_rollout_ref.rollout.n=4', 'actor_rollout_ref.rollout.val_kwargs.n=1', 'actor_rollout_ref.rollout.val_kwargs.temperature=0', 'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'actor_rollout_ref.actor.entropy_coeff=0.0', 'algorithm.kl_ctrl.kl_coef=0.001', 'rllm.mask_truncated_samples=False', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=deepscaler-agent', 'trainer.experiment_name=swe-agent-rl', 'trainer.val_before_train=False', 'trainer.n_gpus_per_node=4', 'trainer.nnodes=1', 'trainer.save_freq=10', 'trainer.test_freq=10', 'trainer.default_hdfs_dir=null', 'rllm.env.name=swe', 'rllm.agent.name=sweagent', 'rllm.agent.max_steps=50', 'rllm.agent.overlong_filter=True', 'rllm.agent.trajectory_timeout=5400', 'trainer.total_epochs=1000']
Traceback (most recent call last):
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 26, in main
run_ppo_agent(config)
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 46, in run_ppo_agent
ray.get(runner.run.remote(config))
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 2967, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 1015, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): ray::TaskRunner.run() (pid=2165352, ip=129.97.152.19, actor_id=1cf5fc28187cc3fcc383855301000000, repr=<train_agent_ppo.TaskRunner object at 0x7f9dbfc6e090>)
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 221, in run
trainer.fit_agent()
File "/home/rllm/rllm/trainer/verl/agent_ppo_trainer.py", line 160, in fit_agent
for batch_dict in self.train_dataloader:
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 450, in next
return super().next()
^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 734, in next
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 1456, in _next_data
return self._process_data(data, worker_id, state_dict)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 1543, in _process_data
data.reraise()
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/_utils.py", line 769, in reraise
raise exception
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/worker.py", line 242, in _worker_loop
data = fetcher.fetch(index) # type: ignore[union-attr]
^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in
data = [self.dataset[idx] for idx in possibly_batched_index]
~~~~~~~~~~~~^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/verl/utils/dataset/rl_dataset.py", line 444, in getitem
index = row_dict.get("extra_info", {}).get("index", 0)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'str' object has no attribute 'get'
Hi, I got this error when trying to run train_deepswe_32b.sh. Any suggestions on how to debug this? Thanks!
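
In case it helps: the failing line, row_dict.get("extra_info", {}).get("index", 0), expects the extra_info column to be a dict, and the AttributeError means it came back as a string in my parquet. Below is a minimal check I put together (a sketch, assuming pandas is available; the path comes from the data.train_files override above):

import json
import pandas as pd

# Path taken from the data.train_files override in the log above.
df = pd.read_parquet("/home/rllm/data/swe/R2E_Gym_Subset.parquet")

# rl_dataset.py line 444 expects extra_info to be a dict, but the
# AttributeError suggests this column holds a string in my file.
sample = df.iloc[0]["extra_info"]
print(type(sample))

# If it is a JSON-encoded string, decoding recovers the dict the
# dataset code expects and the lookup works.
if isinstance(sample, str):
    print(json.loads(sample).get("index", 0))

If extra_info prints as str, the parquet was presumably written with a JSON-serialized extra_info instead of a struct/dict column.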