-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathauto_train_ddp.sh
More file actions
executable file
·132 lines (109 loc) · 3.72 KB
/
Copy pathauto_train_ddp.sh
File metadata and controls
executable file
·132 lines (109 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Load the per-task settings from task_list.conf in the current directory.
# Example of the settings this script reads further down:
#   port=29509
#   GPU_NUM=2          # number of GPUs this task needs
#   THRESHOLD=0.05     # a GPU counts as idle when its memory usage ratio is below this
#   WAIT_MODE=true     # true: keep polling until enough GPUs are free; false: give up at once
#   WAIT_INTERVAL=60   # seconds between polls; only used when WAIT_MODE=true
. ./task_list.conf

# Directory that holds the GPU lock files; created on first use.
LOCK_DIR="$HOME/.gpu_locks"
mkdir -p "${LOCK_DIR}"
#######################################
# Remove the lock files of every GPU this process holds, then exit with the
# status the shell was already exiting with.
# Globals:   ACQUIRED_GPUS (read) - comma-separated GPU indices, may be empty
#            LOCK_DIR (read)      - directory containing gpu_<index>.lock files
# Returns:   never returns; exits the shell
#######################################
cleanup() {
  # Capture the status first: every command below overwrites $?.
  local status=$?
  local -a held=()
  local gpu

  if [ -n "$ACQUIRED_GPUS" ]; then
    IFS=',' read -r -a held <<< "$ACQUIRED_GPUS"
    for gpu in "${held[@]}"; do
      rm -f -- "$LOCK_DIR/gpu_${gpu}.lock"
    done
  fi
  exit "$status"
}
# Locks are released from the EXIT trap only. INT and TERM merely turn the
# signal into a conventional 128+N exit status, which in turn fires the EXIT
# trap. Pointing the signals at cleanup directly made an interrupted run exit
# with status 0, because a bare `exit` there reports the last command's status.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Refuse to start unless both external tools used below, nvidia-smi and bc,
# can be found on PATH.
if ! { command -v nvidia-smi > /dev/null 2>&1 && command -v bc > /dev/null 2>&1; }; then
  echo "Error: Required commands (nvidia-smi/bc) not found."
  exit 1
fi
#######################################
# Query nvidia-smi for the used and total memory of the physical GPUs, as CSV
# without header or units. nvidia-smi's stderr is discarded.
# Outputs:   nvidia-smi's stdout
# Returns:   nvidia-smi's exit status
#######################################
get_physical_gpu_memory() {
  local -a smi_args=(
    --query-gpu=memory.used,memory.total
    --format=csv,noheader,nounits
  )
  nvidia-smi "${smi_args[@]}" 2> /dev/null
}
#######################################
# Try to take the lock for a single GPU.
#
# The lock is the file $LOCK_DIR/gpu_<index>.lock containing the PID of the
# holder. All checks run while holding a short-lived mutex directory
# (<lockfile>.lock) so that two processes cannot claim the same GPU at once.
#
# A lock file whose recorded PID no longer exists is treated as stale and
# reclaimed; otherwise a lock left behind by a killed script or a reboot would
# block the GPU forever. PIDs are only meaningful on the machine that wrote
# them, so this assumes LOCK_DIR is not shared between hosts.
#
# Globals:   LOCK_DIR (read), THRESHOLD (read), BASHPID (read)
# Arguments: $1 - zero-based GPU index (row $1+1 of get_physical_gpu_memory)
# Returns:   0 if the lock was taken, 1 otherwise (mutex busy, lock held by a
#            live process, reading unavailable, or usage not below THRESHOLD)
#######################################
try_acquire_gpu_lock() {
  local gpu=$1
  local lockfile="$LOCK_DIR/gpu_${gpu}.lock"
  local mutex="${lockfile}.lock"
  local used="" total="" usage="" holder=""

  # mkdir either creates the directory or fails, atomically, so only one
  # process at a time gets past this line for a given GPU.
  mkdir "$mutex" 2> /dev/null || return 1

  # Check the lock file before asking nvidia-smi: it is the cheaper test.
  if [ -f "$lockfile" ]; then
    { read -r holder < "$lockfile"; } 2> /dev/null
    # kill -0 probes for existence without sending a signal; /proc covers the
    # case where the process exists but may not be signalled by this user.
    if [[ "$holder" =~ ^[0-9]+$ ]] \
      && ! kill -0 "$holder" 2> /dev/null \
      && [ ! -d "/proc/$holder" ]; then
      rm -f -- "$lockfile"
    else
      # Held by a live process, or owner unknown: leave it alone.
      rmdir "$mutex"
      return 1
    fi
  fi

  # Row gpu+1 looks like "used, total"; split on the comma and the blank.
  IFS=', ' read -r used total \
    < <(get_physical_gpu_memory | sed -n "$((gpu + 1))p")

  # Only hand plain numbers to bc: a missing row, "[N/A]" or a zero total
  # would otherwise produce bc errors instead of a clean refusal.
  if [[ "$used" =~ ^[0-9]+$ && "$total" =~ ^[0-9]+$ ]] && ((10#$total > 0)); then
    usage=$(echo "scale=4; $used / $total" | bc)
    if [ "$(echo "$usage < $THRESHOLD" | bc)" = "1" ]; then
      # Record the owning process ID; report success only if it was written.
      if echo "$BASHPID" > "$lockfile"; then
        rmdir "$mutex"
        return 0
      fi
    fi
  fi

  rmdir "$mutex"
  return 1
}
#######################################
# Drop the lock file of one GPU. A lock file that does not exist is not an
# error.
# Globals:   LOCK_DIR (read)
# Arguments: $1 - GPU index
#######################################
release_gpu_lock() {
  local gpu_index=$1
  local lock_path="${LOCK_DIR}/gpu_${gpu_index}.lock"
  rm -f "${lock_path}"
}
#######################################
# Scan the GPUs in index order and lock the first `needed` ones that
# try_acquire_gpu_lock accepts. It is all or nothing: if fewer than `needed`
# can be locked, every lock taken during this call is released again.
#
# ACQUIRED_GPUS is updated after each individual lock, not only at the end,
# so that a script-level exit handler reading it can release locks taken by a
# scan that gets interrupted half-way.
#
# Globals:   ACQUIRED_GPUS (written) - comma-separated indices on success,
#                                      empty after a failed scan
# Arguments: $1 - number of GPUs required (positive integer)
# Returns:   0 if `needed` GPUs were locked, 1 otherwise
#######################################
find_and_lock_gpus() {
  local needed=$1
  local -a acquired=()
  local gpu_info total_gpus i gpu

  if ! [[ "$needed" =~ ^[1-9][0-9]*$ ]]; then
    echo "find_and_lock_gpus: GPU count must be a positive integer, got '${needed}'" >&2
    return 1
  fi

  # Declared separately above so `local` cannot mask the command's status.
  gpu_info=$(get_physical_gpu_memory)
  # No output means no GPU could be queried; counting lines of an empty
  # string would wrongly yield one.
  if [ -z "$gpu_info" ]; then
    return 1
  fi
  total_gpus=$(printf '%s\n' "$gpu_info" | wc -l)

  for ((i = 0; i < total_gpus; i++)); do
    if try_acquire_gpu_lock "$i"; then
      acquired+=("$i")
      # Joined in a subshell so the IFS change cannot leak into callees.
      ACQUIRED_GPUS=$(IFS=','; echo "${acquired[*]}")
      if [ "${#acquired[@]}" -eq "$needed" ]; then
        return 0
      fi
    fi
  done

  # Not enough GPUs: give back whatever was locked during this scan.
  for gpu in "${acquired[@]}"; do
    release_gpu_lock "$gpu"
  done
  ACQUIRED_GPUS=""
  return 1
}
# Main allocation step: lock GPU_NUM GPUs and expose them to the commands
# below through CUDA_VISIBLE_DEVICES.

# Without a usable GPU_NUM no scan can ever succeed, and in wait mode the
# script would poll forever; stop with a clear message instead.
if ! [[ "$GPU_NUM" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: GPU_NUM must be a positive integer (got '${GPU_NUM}')." >&2
  exit 1
fi

# WAIT_MODE is compared as text rather than executed as a command. An unset
# value keeps selecting wait mode, as it did before.
if [[ "${WAIT_MODE:-true}" == "true" ]]; then
  # Wait mode: poll until enough GPUs are free.
  # An unset interval falls back to 60 seconds (the value shown in the sample
  # configuration); an unusable one is rejected. Either would make `sleep`
  # fail and turn this loop into a busy loop.
  : "${WAIT_INTERVAL:=60}"
  if ! [[ "$WAIT_INTERVAL" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
    echo "Error: WAIT_INTERVAL is not a valid sleep duration (got '${WAIT_INTERVAL}')." >&2
    exit 1
  fi
  while true; do
    if find_and_lock_gpus "$GPU_NUM"; then
      export CUDA_VISIBLE_DEVICES="$ACQUIRED_GPUS"
      echo "$(date '+%Y-%m-%d %H:%M:%S') - exp: ${result_dir} - Acquired GPUs: $ACQUIRED_GPUS"
      break
    fi
    echo "$(date '+%Y-%m-%d %H:%M:%S') - exp: ${result_dir} - Waiting for $GPU_NUM GPUs (interval: ${WAIT_INTERVAL}s)..."
    sleep "$WAIT_INTERVAL"
  done
else
  # No-wait mode: a single attempt, exit with status 1 if it fails.
  if find_and_lock_gpus "$GPU_NUM"; then
    export CUDA_VISIBLE_DEVICES="$ACQUIRED_GPUS"
    echo "$(date '+%Y-%m-%d %H:%M:%S') - Acquired GPUs: $ACQUIRED_GPUS"
  else
    echo "$(date '+%Y-%m-%d %H:%M:%S') - Failed to acquire $GPU_NUM GPUs"
    exit 1
  fi
fi
# Train with torchrun, one process per requested GPU, then run the evaluation
# pass. Both read config.yaml from, and write to, ./<result_dir>.
log_dir=.
save_dir="${log_dir}/${result_dir}"
mkdir -p -- "$save_dir"

torchrun --nproc-per-node "$GPU_NUM" --master_port "$port" main.py \
  --config "${save_dir}/config.yaml" \
  --output_dir "$save_dir" \
  --eval_ppl --use_ddp
train_status=$?

# Do not evaluate after a failed training run: the checkpoint passed to
# --resume below would be missing or left over from an earlier run
# (presumably it is written by the training step - confirm against main.py).
if [ "$train_status" -ne 0 ]; then
  echo "Error: torchrun exited with status ${train_status}; skipping evaluation." >&2
  exit "$train_status"
fi

# Evaluation on the merged weights; the script's exit status is python's.
result_dir="./${result_dir}"
python main.py \
  --config "${result_dir}/config.yaml" \
  --output_dir "${result_dir}" \
  --test_mode --weight_merge \
  --lm_eval_batch_size auto:4 \
  --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande,mmlu,lambada \
  --eval_ppl \
  --resume "${result_dir}/slider_parameters.pth"