GPU: 4xA100 (80GB each)
Model: Qwen3.5-4B
Script: bash ../openclaw-combine/run_qwen35_4b_openclaw_combine_lora.sh
Description:
The setup is able to run inference and collect samples successfully, but when the training program runs it crashes with the error below.
Error:
Traceback (most recent call last):
File "/workspace/OpenClaw-RL/slime/train_async.py", line 76, in
train(args)
File "/workspace/OpenClaw-RL/slime/train_async.py", line 47, in train
ray.get(actor_model.async_train(rollout_id, rollout_data_curr_ref))
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 2981, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 1012, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AcceleratorError): ray::FSDPTrainRayActor.train() (pid=44754, ip=172.17.0.2, actor_id=2050cafad27d6df5648f7bed02000000, repr=<slime.backends.fsdp_utils.actor.FSDPTrainRayActor object at 0x7f20f3e9b1d0>)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1829, in inner
result = forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 843, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 2024, in forward
outputs = self.model(
^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 843, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1775, in forward
outputs = self.language_model(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 917, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/output_capturing.py", line 253, in wrapper
outputs = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward
hidden_states = decoder_layer(
^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 93, in call
return super().call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1829, in inner
result = forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 865, in forward
hidden_states, _ = self.self_attn(
^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 774, in forward
attn_output, attn_weights = attention_interface(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/integrations/flash_attention.py", line 68, in flash_attention_forward
attn_output = _flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/modeling_flash_attention_utils.py", line 677, in _flash_attention_forward
elif is_fa_with_varlen_kwargs or is_fa_with_position_ids:
^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress` in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing `CUDA_LAUNCH_BLOCKING=1`. Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
GPU: 4xA100 (80GB each)
Model: Qwen3.5-4B
Script: bash ../openclaw-combine/run_qwen35_4b_openclaw_combine_lora.sh
Description:
The setup is able to run inference and collect samples successfully, but when the training program runs it crashes with the error below.
Error:
Traceback (most recent call last):
File "/workspace/OpenClaw-RL/slime/train_async.py", line 76, in
train(args)
File "/workspace/OpenClaw-RL/slime/train_async.py", line 47, in train
ray.get(actor_model.async_train(rollout_id, rollout_data_curr_ref))
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 2981, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/ray/_private/worker.py", line 1012, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AcceleratorError): ray::FSDPTrainRayActor.train() (pid=44754, ip=172.17.0.2, actor_id=2050cafad27d6df5648f7bed02000000, repr=<slime.backends.fsdp_utils.actor.FSDPTrainRayActor object at 0x7f20f3e9b1d0>)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1829, in inner
result = forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 843, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 2024, in forward
outputs = self.model(
^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 843, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1775, in forward
outputs = self.language_model(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 917, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/utils/output_capturing.py", line 253, in wrapper
outputs = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward
hidden_states = decoder_layer(
^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 93, in call
return super().call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1829, in inner
result = forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 865, in forward
hidden_states, _ = self.self_attn(
^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 774, in forward
attn_output, attn_weights = attention_interface(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/integrations/flash_attention.py", line 68, in flash_attention_forward
attn_output = _flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/OpenClaw-RL/.venv/lib/python3.12/site-packages/transformers/modeling_flash_attention_utils.py", line 677, in _flash_attention_forward
elif is_fa_with_varlen_kwargs or is_fa_with_position_ids:
^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress` in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing `CUDA_LAUNCH_BLOCKING=1`. Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.