While training, it seems most likely that NaNs are introduced either by the data or by the scene-centric implementation when the data is prepared using trajdata. I am using CUDA 12.x; could that be the problem?
Sanity Checking: 0%| | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "ccdiff/examples/train.py", line 336, in <module>
main(default_config, auto_remove_exp_dir=args.remove_exp_dir, debug=args.debug)
File "ccdiff/examples/train.py", line 216, in main
trainer.fit(model=model, datamodule=datamodule)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 582, in fit
call._call_and_handle_interrupt(
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 624, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1061, in _run
results = self._run_stage()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1140, in _run_stage
self._run_train()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1153, in _run_train
self._run_sanity_check()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1225, in _run_sanity_check
val_loop.run()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 137, in advance
output = self._evaluation_step(**kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 234, in _evaluation_step
output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1443, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in validation_step
return self.model.validation_step(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/ccdiff/algos/algos.py", line 331, in validation_step
batch = batch_utils().parse_batch(batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/batch_utils.py", line 119, in parse_batch
return av_utils.parse_trajdata_batch(data_batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/trajdata_utils.py", line 528, in parse_trajdata_batch
d = parse_scene_centric(batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/trajdata_utils.py", line 259, in parse_scene_centric
assert torch.all(centered_state[:, -1] == centered_state.heading[...,0])
While training, it seems most likely that NaNs are introduced either by the data or by the scene-centric implementation when the data is prepared using trajdata. I am using CUDA 12.x; could that be the problem?
Sanity Checking: 0%| | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "ccdiff/examples/train.py", line 336, in <module>
main(default_config, auto_remove_exp_dir=args.remove_exp_dir, debug=args.debug)
File "ccdiff/examples/train.py", line 216, in main
trainer.fit(model=model, datamodule=datamodule)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 582, in fit
call._call_and_handle_interrupt(
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 624, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1061, in _run
results = self._run_stage()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1140, in _run_stage
self._run_train()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1153, in _run_train
self._run_sanity_check()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1225, in _run_sanity_check
val_loop.run()
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 137, in advance
output = self._evaluation_step(**kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 234, in _evaluation_step
output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1443, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in validation_step
return self.model.validation_step(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/ccdiff/algos/algos.py", line 331, in validation_step
batch = batch_utils().parse_batch(batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/batch_utils.py", line 119, in parse_batch
return av_utils.parse_trajdata_batch(data_batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/conda_envs/ccdiff/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/trajdata_utils.py", line 528, in parse_trajdata_batch
d = parse_scene_centric(batch)
File "/lustre/nvwulf/projects/QinGroup-nvwulf/keli/CCD/CCDiff/third_party/tbsim/tbsim/utils/trajdata_utils.py", line 259, in parse_scene_centric
assert torch.all(centered_state[:, -1] == centered_state.heading[...,0])