Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions tests/test_utils/ltp_scripts/run_numerical_tests_amd_mi300x_1n8g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Stop at the first command that exits with a non-zero status.
set -e

# Install the Python packages listed in requirements_ci.txt, plus mock.
pip install -r requirements_ci.txt
pip install mock

# ROCm envs
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HIP_FORCE_DEV_KERNARG=1
export HSA_ENABLE_SDMA=1
export HSA_NO_SCRATCH_RECLAIM=1

# RCCL envs
export NCCL_DEBUG=WARN
# NOTE(review): the interface name eth0 is host-specific — confirm it matches
# the machine this script runs on.
export NCCL_SOCKET_IFNAME=eth0
export RCCL_MSCCL_ENABLE=0

# TransformerEngine envs
# NOTE(review): these look like attention-backend toggles with only the
# fused/CK entries set to 1 — confirm against the TransformerEngine docs.
# All six variables are unset again further down, before the loss test.
export NVTE_FLASH_ATTN=0
export NVTE_CK_USES_BWD_V3=0
export NVTE_FUSED_ATTN=1
export NVTE_FUSED_ATTN_CK=1
export NVTE_FUSED_ATTN_AOTRITON=0
export NVTE_UNFUSED_ATTN=0

# Megatron-LM envs
# CRITICAL: 50, ERROR: 40, WARNING: 30, INFO: 20, DEBUG: 10, NOTSET: 0
# 20 corresponds to INFO in the table above.
export MEGATRON_LOGGING_LEVEL=20

# Launcher arguments expanded into every torchrun call made by
# run_numerical_tests. Starts with one process per node on a single node;
# reassigned with --nproc_per_node 8 before the moe_layer test.
TORCHRUN_ARGS=(
--nproc_per_node 1
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 50326
)

#######################################
# Force-kills every process whose `ps axu` line mentions "python", then
# sleeps for 10 seconds.
# Best effort: kill errors are discarded and a failing pipeline is ignored,
# so this never aborts the script under `set -e`.
# Globals:   none
# Arguments: none
# Returns:   exit status of `sleep`
#######################################
clear_previous_runs() {
  # '[p]ython' matches "python" but not the grep command line itself, so grep
  # does not list its own process. awk prints field 2 of each matching line
  # (the PID column of `ps axu`). `xargs -r` skips kill when nothing matched.
  ps axu \
    | grep '[p]ython' \
    | awk '{print $2}' \
    | xargs -r -n 1 kill -9 2>/dev/null \
    || true
  sleep 10
}

# Root directory for everything this script writes. It is removed up front so
# results from an earlier invocation are not mixed into this one.
result_dir="./numerical_test_results/amd_mi300x"
# Quoted, with `--` and `:?`, so an empty or option-like value can never turn
# this into a destructive rm of the wrong path.
rm -rf -- "${result_dir:?}"

#######################################
# Runs one module's numerical test several times and post-processes the
# results into mean/std files and pairwise similarity reports.
#
# Steps:
#   1. Launch tests/numerical_tests/modules/test_<module>.py through torchrun
#      once per run; each run writes into its own module_test/<run> directory.
#   2. For every distinct result file name, pass the list of per-run copies
#      to module_mean_and_std.py (outputs <name>.mean.pt and <name>.std.pt).
#   3. For every result file name and every run pair x < y, call
#      module_similarity.py (outputs <name>.<x>-<y>.json).
#   4. Delete the raw per-run results.
#
# Globals:
#   result_dir               (read) root directory for all outputs
#   TORCHRUN_ARGS            (read) array of launcher arguments for torchrun
#   NUMERICAL_TEST_NUM_RUNS  (read, optional) number of repetitions;
#                            defaults to 20
# Arguments:
#   $1 - module name, e.g. "attention"
# Returns:
#   1 if NUMERICAL_TEST_NUM_RUNS is not a positive integer. Failures of the
#   launched commands abort the script when `set -e` is active.
#
# TODO(review): consider moving these loops into a Python helper so the AMD
# and NVIDIA scripts do not carry duplicate copies of this function.
#######################################
run_numerical_tests() {
  local module="${1:?module name is required}"
  local num_runs="${NUMERICAL_TEST_NUM_RUNS:-20}"
  local module_dir="${result_dir:?result_dir must be set}/${module}"
  local raw_dir="${module_dir}/module_test"
  local stats_dir="${module_dir}/module_mean_and_std"
  local similarity_dir="${module_dir}/module_similarity"
  local input_list="${stats_dir}/input_list.txt"
  local -a file_names=()
  local name x y

  if ! [[ "${num_runs}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'NUMERICAL_TEST_NUM_RUNS must be a positive integer, got: %s\n' \
      "${num_runs}" >&2
    return 1
  fi

  # Get raw module test results
  for ((x = 0; x < num_runs; x++)); do
    mkdir -p "${raw_dir}/${x}"
    clear_previous_runs
    torchrun \
      "${TORCHRUN_ARGS[@]}" \
      -m pytest -vxs \
      "tests/numerical_tests/modules/test_${module}.py" \
      --result-dir "${raw_dir}/${x}"
  done

  # Collect the distinct base names of all result files across runs. Reading
  # them into an array keeps names with spaces intact (names containing
  # newlines are not supported).
  mapfile -t file_names < <(
    find "${raw_dir}" -type f -printf '%f\n' | sort -u
  )
  if ((${#file_names[@]} == 0)); then
    printf 'run_numerical_tests: no result files found under %s\n' \
      "${raw_dir}" >&2
  fi

  # Calculate module mean and std
  mkdir -p "${stats_dir}"
  for name in "${file_names[@]}"; do
    # Truncating write: a stale list left behind by an interrupted run can
    # never leak into this one.
    for ((x = 0; x < num_runs; x++)); do
      printf '%s\n' "${raw_dir}/${x}/${name}"
    done > "${input_list}"
    python \
      tests/numerical_tests/utils/module_mean_and_std.py \
      --input-list "${input_list}" \
      --output-mean-file "${stats_dir}/${name}.mean.pt" \
      --output-std-file "${stats_dir}/${name}.std.pt"
    rm -- "${input_list}"
  done

  # Calculate intra-module similarity
  mkdir -p "${similarity_dir}"
  for name in "${file_names[@]}"; do
    for ((x = 0; x < num_runs; x++)); do
      # Start at x + 1 so each unordered pair of runs is compared once.
      for ((y = x + 1; y < num_runs; y++)); do
        python \
          tests/numerical_tests/utils/module_similarity.py \
          --stats-a "${raw_dir}/${x}/${name}" \
          --stats-b "${raw_dir}/${y}/${name}" \
          --output-file "${similarity_dir}/${name}.${x}-${y}.json"
      done
    done
  done

  # Remove raw module test results
  rm -rf -- "${raw_dir}"
}

# Module tests launched with the TORCHRUN_ARGS and NVTE_* settings that are in
# effect at this point of the script, in this fixed order.
for module in attention bda embedding layer_norm logits mlp rope; do
  run_numerical_tests "${module}"
done
Comment on lines +95 to +101

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add the script first and move these lines to the corresponding PR?


# Drop the NVTE_* variables exported near the top of the script so the loss
# test runs without them.
unset \
  NVTE_FLASH_ATTN \
  NVTE_CK_USES_BWD_V3 \
  NVTE_FUSED_ATTN \
  NVTE_FUSED_ATTN_CK \
  NVTE_FUSED_ATTN_AOTRITON \
  NVTE_UNFUSED_ATTN

run_numerical_tests loss

# Same launcher settings as before except for 8 processes per node, used for
# the moe_layer test only.
TORCHRUN_ARGS=(
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 50326
)

run_numerical_tests moe_layer
104 changes: 104 additions & 0 deletions tests/test_utils/ltp_scripts/run_numerical_tests_nvidia_h200_1n8g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Stop at the first command that exits with a non-zero status.
set -e

# Install the Python packages listed in requirements_ci.txt, then grouped_gemm
# pinned to tag v1.1.4 straight from its Git repository.
pip install -r requirements_ci.txt
pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4

# CUDA envs
export CUDA_DEVICE_MAX_CONNECTIONS=1

# NCCL envs
export CUDA_DEVICE_ORDER=PCI_BUS_ID
export NCCL_DEBUG=WARN
export NCCL_IB_PCI_RELAXED_ORDERING=1
export NCCL_NET_GDR_LEVEL=5
# NOTE(review): the interface name eth0 is host-specific — confirm it matches
# the machine this script runs on.
export NCCL_SOCKET_IFNAME=eth0
# NOTE(review): machine-specific absolute path — confirm the topology file
# exists on the target host.
export NCCL_TOPO_FILE=/opt/microsoft/ndv5-topo.xml

# Megatron-LM envs
# CRITICAL: 50, ERROR: 40, WARNING: 30, INFO: 20, DEBUG: 10, NOTSET: 0
# 20 corresponds to INFO in the table above.
export MEGATRON_LOGGING_LEVEL=20

# Launcher arguments expanded into every torchrun call made by
# run_numerical_tests. Starts with one process per node on a single node;
# reassigned with --nproc_per_node 8 before the moe_layer test.
TORCHRUN_ARGS=(
--nproc_per_node 1
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 50326
)

#######################################
# Force-kills every process whose `ps axu` line mentions "python", then
# sleeps for 10 seconds.
# Best effort: kill errors are discarded and a failing pipeline is ignored,
# so this never aborts the script under `set -e`.
# Globals:   none
# Arguments: none
# Returns:   exit status of `sleep`
#######################################
clear_previous_runs() {
  # '[p]ython' matches "python" but not the grep command line itself, so grep
  # does not list its own process. awk prints field 2 of each matching line
  # (the PID column of `ps axu`). `xargs -r` skips kill when nothing matched.
  ps axu \
    | grep '[p]ython' \
    | awk '{print $2}' \
    | xargs -r -n 1 kill -9 2>/dev/null \
    || true
  sleep 10
}

# Root directory for everything this script writes. It is removed up front so
# results from an earlier invocation are not mixed into this one.
# TODO(review): two runs that share this directory may interfere with each
# other; consider adding a commit hash to the path.
result_dir="./numerical_test_results/nvidia_h200"
# Quoted, with `--` and `:?`, so an empty or option-like value can never turn
# this into a destructive rm of the wrong path.
rm -rf -- "${result_dir:?}"

#######################################
# Runs one module's numerical test several times and post-processes the
# results into mean/std files and pairwise similarity reports.
#
# Steps:
#   1. Launch tests/numerical_tests/modules/test_<module>.py through torchrun
#      once per run; each run writes into its own module_test/<run> directory.
#   2. For every distinct result file name, pass the list of per-run copies
#      to module_mean_and_std.py (outputs <name>.mean.pt and <name>.std.pt).
#   3. For every result file name and every run pair x < y, call
#      module_similarity.py (outputs <name>.<x>-<y>.json).
#   4. Delete the raw per-run results.
#
# Globals:
#   result_dir               (read) root directory for all outputs
#   TORCHRUN_ARGS            (read) array of launcher arguments for torchrun
#   NUMERICAL_TEST_NUM_RUNS  (read, optional) number of repetitions;
#                            defaults to 20
# Arguments:
#   $1 - module name, e.g. "attention"
# Returns:
#   1 if NUMERICAL_TEST_NUM_RUNS is not a positive integer. Failures of the
#   launched commands abort the script when `set -e` is active.
#######################################
run_numerical_tests() {
  local module="${1:?module name is required}"
  local num_runs="${NUMERICAL_TEST_NUM_RUNS:-20}"
  local module_dir="${result_dir:?result_dir must be set}/${module}"
  local raw_dir="${module_dir}/module_test"
  local stats_dir="${module_dir}/module_mean_and_std"
  local similarity_dir="${module_dir}/module_similarity"
  local input_list="${stats_dir}/input_list.txt"
  local -a file_names=()
  local name x y

  if ! [[ "${num_runs}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'NUMERICAL_TEST_NUM_RUNS must be a positive integer, got: %s\n' \
      "${num_runs}" >&2
    return 1
  fi

  # Get raw module test results
  for ((x = 0; x < num_runs; x++)); do
    mkdir -p "${raw_dir}/${x}"
    clear_previous_runs
    torchrun \
      "${TORCHRUN_ARGS[@]}" \
      -m pytest -vxs \
      "tests/numerical_tests/modules/test_${module}.py" \
      --result-dir "${raw_dir}/${x}"
  done

  # Collect the distinct base names of all result files across runs. Reading
  # them into an array keeps names with spaces intact (names containing
  # newlines are not supported).
  mapfile -t file_names < <(
    find "${raw_dir}" -type f -printf '%f\n' | sort -u
  )
  if ((${#file_names[@]} == 0)); then
    printf 'run_numerical_tests: no result files found under %s\n' \
      "${raw_dir}" >&2
  fi

  # Calculate module mean and std
  mkdir -p "${stats_dir}"
  for name in "${file_names[@]}"; do
    # Truncating write: a stale list left behind by an interrupted run can
    # never leak into this one.
    for ((x = 0; x < num_runs; x++)); do
      printf '%s\n' "${raw_dir}/${x}/${name}"
    done > "${input_list}"
    python \
      tests/numerical_tests/utils/module_mean_and_std.py \
      --input-list "${input_list}" \
      --output-mean-file "${stats_dir}/${name}.mean.pt" \
      --output-std-file "${stats_dir}/${name}.std.pt"
    rm -- "${input_list}"
  done

  # Calculate intra-module similarity
  mkdir -p "${similarity_dir}"
  for name in "${file_names[@]}"; do
    for ((x = 0; x < num_runs; x++)); do
      # Start at x + 1 so each unordered pair of runs is compared once.
      for ((y = x + 1; y < num_runs; y++)); do
        python \
          tests/numerical_tests/utils/module_similarity.py \
          --stats-a "${raw_dir}/${x}/${name}" \
          --stats-b "${raw_dir}/${y}/${name}" \
          --output-file "${similarity_dir}/${name}.${x}-${y}.json"
      done
    done
  done

  # Remove raw module test results
  rm -rf -- "${raw_dir}"
}

# Module tests launched with the initial TORCHRUN_ARGS (one process per node),
# in this fixed order.
for module in attention bda embedding layer_norm logits loss mlp rope; do
  run_numerical_tests "${module}"
done

# Same launcher settings as before except for 8 processes per node, used for
# the moe_layer test only.
TORCHRUN_ARGS=(
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 50326
)

run_numerical_tests moe_layer
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Stop at the first command that exits with a non-zero status.
set -e

# All three locations live under one shared results root.
results_root="./numerical_test_results"

# Inputs: per-platform statistics directories (side A and side B).
stats_dir_a="${results_root}/nvidia_h200"
stats_dir_b="${results_root}/amd_mi300x"
# Output: where the cross-platform comparison is written.
result_dir="${results_root}/nvidia_h200_vs_amd_mi300x"

#######################################
# Compares one module's mean/std statistics between the two platforms.
#
# Every file found under <stats_dir_a>/<module>/module_mean_and_std is paired
# with the file of the same name under stats_dir_b and handed to
# module_similarity.py, which writes <name>.json into
# <result_dir>/<module>/module_similarity. Files that exist only on the
# stats_dir_b side are not enumerated and therefore not compared.
#
# Globals:
#   stats_dir_a, stats_dir_b (read) per-platform statistics roots
#   result_dir               (read) output root
# Arguments:
#   $1 - module name, e.g. "attention"
# Returns:
#   1 if at least one file from side A has no counterpart on side B (the
#   remaining files are still compared), 0 otherwise.
#
# TODO(review): nothing here asserts on the similarity results themselves;
# decide what should happen when the two platforms disagree.
#######################################
module_similarity() {
  local module="${1:?module name is required}"
  local dir_a="${stats_dir_a}/${module}/module_mean_and_std"
  local dir_b="${stats_dir_b}/${module}/module_mean_and_std"
  local out_dir="${result_dir}/${module}/module_similarity"
  local -a file_names=()
  local name
  local missing=0

  # Read names into an array so names with spaces stay intact (names
  # containing newlines are not supported).
  mapfile -t file_names < <(
    find "${dir_a}" -type f -printf '%f\n' | sort -u
  )

  mkdir -p "${out_dir}"
  for name in "${file_names[@]}"; do
    # Report a missing counterpart explicitly instead of handing the
    # comparison tool a path that does not exist.
    if [[ ! -f "${dir_b}/${name}" ]]; then
      printf 'module_similarity: %s has no counterpart in %s\n' \
        "${name}" "${dir_b}" >&2
      missing=1
      continue
    fi
    python \
      tests/numerical_tests/utils/module_similarity.py \
      --stats-a "${dir_a}/${name}" \
      --stats-b "${dir_b}/${name}" \
      --output-file "${out_dir}/${name}.json"
  done

  return "${missing}"
}

# Compare every module between the two platforms, in this fixed order.
for module in \
  attention bda embedding layer_norm logits loss mlp moe_layer rope; do
  module_similarity "${module}"
done
Loading