Skip to content

'str' object has no attribute 'get' #364

@xuyeliu

Description

@xuyeliu

(TaskRunner pid=2165352) train_sampling_params: {'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'logprobs': 1}
(TaskRunner pid=2165352) val_sampling_params: {'temperature': 0, 'top_k': -1, 'top_p': 1.0, 'logprobs': 1}
(TaskRunner pid=2165352) Checkpoint tracker file does not exist: /home/rllm/checkpoints/deepscaler-agent/swe-agent-rl/latest_checkpointed_iteration.txt
(TaskRunner pid=2165352) Training from scratch
(TaskRunner pid=2165352) Time taken to validate agent: 3.9577484130859375e-05
(TaskRunner pid=2165352) 'epoch 0, step 1 started'
Error executing job with overrides: ['algorithm.adv_estimator=rloo', 'data.train_files=/home/rllm/data/swe/R2E_Gym_Subset.parquet', 'data.val_files=/home/rllm/data/swe/SWE_Bench_Verified.parquet', 'data.train_batch_size=2', 'data.val_batch_size=128', 'data.max_prompt_length=4096', 'data.max_response_length=32768', 'data.filter_overlong_prompts=True', 'data.filter_overlong_prompts_workers=32', 'actor_rollout_ref.model.path=Qwen/Qwen3-32B', 'actor_rollout_ref.hybrid_engine=True', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.model.use_remove_padding=True', 'actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum', 'actor_rollout_ref.actor.ppo_mini_batch_size=2', 'actor_rollout_ref.actor.use_dynamic_bsz=False', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1', 'actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1', 'actor_rollout_ref.actor.ppo_max_token_len_per_gpu=16000', 'actor_rollout_ref.actor.use_kl_loss=False', 'actor_rollout_ref.actor.clip_ratio_high=0.28', 'actor_rollout_ref.actor.kl_loss_coef=0.001', 'actor_rollout_ref.actor.kl_loss_type=low_var_kl', 'actor_rollout_ref.actor.ulysses_sequence_parallel_size=1', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', 'actor_rollout_ref.actor.fsdp_config.param_offload=True', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=True', 'actor_rollout_ref.rollout.tensor_model_parallel_size=4', 'actor_rollout_ref.rollout.name=vllm', 'actor_rollout_ref.rollout.mode=async', 'actor_rollout_ref.rollout.enforce_eager=False', 'actor_rollout_ref.rollout.temperature=1.0', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'actor_rollout_ref.rollout.max_model_len=30000', 'actor_rollout_ref.rollout.max_num_seqs=64', 'actor_rollout_ref.rollout.n=4', 'actor_rollout_ref.rollout.val_kwargs.n=1', 'actor_rollout_ref.rollout.val_kwargs.temperature=0', 
'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'actor_rollout_ref.actor.entropy_coeff=0.0', 'algorithm.kl_ctrl.kl_coef=0.001', 'rllm.mask_truncated_samples=False', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=deepscaler-agent', 'trainer.experiment_name=swe-agent-rl', 'trainer.val_before_train=False', 'trainer.n_gpus_per_node=4', 'trainer.nnodes=1', 'trainer.save_freq=10', 'trainer.test_freq=10', 'trainer.default_hdfs_dir=null', 'rllm.env.name=swe', 'rllm.agent.name=sweagent', 'rllm.agent.max_steps=50', 'rllm.agent.overlong_filter=True', 'rllm.agent.trajectory_timeout=5400', 'trainer.total_epochs=1000']
Traceback (most recent call last):
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 26, in main
run_ppo_agent(config)
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 46, in run_ppo_agent
ray.get(runner.run.remote(config))
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 2967, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 1015, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): ray::TaskRunner.run() (pid=2165352, ip=129.97.152.19, actor_id=1cf5fc28187cc3fcc383855301000000, repr=<train_agent_ppo.TaskRunner object at 0x7f9dbfc6e090>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/rllm/trainer/verl/train_agent_ppo.py", line 221, in run
trainer.fit_agent()
File "/home/rllm/rllm/trainer/verl/agent_ppo_trainer.py", line 160, in fit_agent
for batch_dict in self.train_dataloader:
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 450, in __next__
return super().__next__()
^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 1456, in _next_data
return self._process_data(data, worker_id, state_dict)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/stateful_dataloader.py", line 1543, in _process_data
data.reraise()
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/_utils.py", line 769, in reraise
raise exception
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/rllm/.venv/lib/python3.11/site-packages/torchdata/stateful_dataloader/worker.py", line 242, in _worker_loop
data = fetcher.fetch(index) # type: ignore[union-attr]
^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
~~~~~~~~~~~~^^^^^
File "/home/rllm/.venv/lib/python3.11/site-packages/verl/utils/dataset/rl_dataset.py", line 444, in __getitem__
index = row_dict.get("extra_info", {}).get("index", 0)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'str' object has no attribute 'get'

Hi, I got this error when trying to run train_deepswe_32b.sh. Are there any suggestions on how to debug this? Thanks.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions