forked from ByteDance-Seed/VeOmni
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.sh
More file actions
42 lines (37 loc) · 1.24 KB
/
train.sh
File metadata and controls
42 lines (37 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
# Launch a torchrun training job on either NVIDIA GPUs or Ascend NPUs.
#
# Tunable env vars (all have defaults):
#   NNODES          - number of nodes (default 1; >1 enables rendezvous mode)
#   NPROC_PER_NODE  - workers per node (default: auto-detected device count)
#   NODE_RANK       - rank of this node (default 0)
#   MASTER_ADDR     - rendezvous address for multi-node runs (default 0.0.0.0)
#   MASTER_PORT     - rendezvous port for multi-node runs (default 12345)
# All remaining script arguments are forwarded verbatim to torchrun.
set -x
set -o pipefail

export TOKENIZERS_PARALLELISM=false
export TORCH_NCCL_AVOID_RECORD_STREAMS=1

NNODES=${NNODES:=1}

if command -v nvidia-smi &> /dev/null && nvidia-smi --list-gpus &> /dev/null; then
    # GPU: prefer the visible-device mask, otherwise count all GPUs.
    if [[ -n "${CUDA_VISIBLE_DEVICES}" ]]; then
        NPROC_PER_NODE=${NPROC_PER_NODE:=$(echo "${CUDA_VISIBLE_DEVICES}" | tr ',' '\n' | wc -l)}
    else
        NPROC_PER_NODE=${NPROC_PER_NODE:=$(nvidia-smi --list-gpus | wc -l)}
    fi
    export NCCL_DEBUG=WARN
else
    # NPU: prefer the visible-device mask, otherwise count davinci device nodes.
    if [[ -n "${ASCEND_RT_VISIBLE_DEVICES}" ]]; then
        NPROC_PER_NODE=${NPROC_PER_NODE:=$(echo "${ASCEND_RT_VISIBLE_DEVICES}" | tr ',' '\n' | wc -l)}
    else
        # Count /dev/davinci<N> nodes via glob instead of parsing `ls` output
        # (which also emits a "total" line under -l and miscounts).
        # davinci_manager is excluded by the [0-9] pattern; nullglob yields an
        # empty array (count 0) when no device nodes exist.
        shopt -s nullglob
        npu_devices=(/dev/davinci[0-9]*)
        shopt -u nullglob
        NPROC_PER_NODE=${NPROC_PER_NODE:=${#npu_devices[@]}}
    fi
    # NPU env that may optimize performance
    export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:='expandable_segments:True'}
fi

NODE_RANK=${NODE_RANK:=0}
MASTER_ADDR=${MASTER_ADDR:=0.0.0.0}
MASTER_PORT=${MASTER_PORT:=12345}

# Build optional torchrun flags as an array so values are never word-split.
additional_args=()
if [[ "$NNODES" == "1" ]]; then
    additional_args+=(--standalone)
else
    additional_args+=(--rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}")
fi

# Forward all script arguments to torchrun; pipefail (set above) keeps
# torchrun's exit status even though output is piped through tee.
torchrun \
    --nnodes="$NNODES" \
    --nproc-per-node="$NPROC_PER_NODE" \
    --node-rank="$NODE_RANK" \
    "${additional_args[@]}" "$@" 2>&1 | tee log.txt