-
Notifications
You must be signed in to change notification settings - Fork 4
Tests - Add LTP scripts to run module-level numerical tests #79
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| # Setup for the module-level numerical test run whose results are written under | ||
| # ./numerical_test_results/amd_mi300x: dependencies, environment, launcher arguments. | ||
| # Abort the script as soon as any command exits with a non-zero status. | ||
| set -e | ||
|
|
||
| # Install the Python packages listed in requirements_ci.txt, plus mock. | ||
| pip install -r requirements_ci.txt | ||
| pip install mock | ||
|
|
||
| # ROCm envs | ||
| export CUDA_DEVICE_MAX_CONNECTIONS=1 | ||
| export HIP_FORCE_DEV_KERNARG=1 | ||
| export HSA_ENABLE_SDMA=1 | ||
| export HSA_NO_SCRATCH_RECLAIM=1 | ||
|
|
||
| # RCCL envs | ||
| export NCCL_DEBUG=WARN | ||
| export NCCL_SOCKET_IFNAME=eth0 | ||
| export RCCL_MSCCL_ENABLE=0 | ||
|
|
||
| # TransformerEngine envs | ||
| # NOTE: all six NVTE_* variables below are unset again later, right before the loss test. | ||
| export NVTE_FLASH_ATTN=0 | ||
| export NVTE_CK_USES_BWD_V3=0 | ||
| export NVTE_FUSED_ATTN=1 | ||
| export NVTE_FUSED_ATTN_CK=1 | ||
| export NVTE_FUSED_ATTN_AOTRITON=0 | ||
| export NVTE_UNFUSED_ATTN=0 | ||
|
|
||
| # Megatron-LM envs | ||
| # CRITICAL: 50, ERROR: 40, WARNING: 30, INFO: 20, DEBUG: 10, NOTSET: 0 | ||
| export MEGATRON_LOGGING_LEVEL=20 | ||
|
|
||
| # Arguments passed to torchrun: one process on one node. | ||
| # The array is redefined with --nproc_per_node 8 before the moe_layer test. | ||
| TORCHRUN_ARGS=( | ||
| --nproc_per_node 1 | ||
| --nnodes 1 | ||
| --node_rank 0 | ||
| --master_addr localhost | ||
| --master_port 50326 | ||
| ) | ||
|
|
||
| ####################################### | ||
| # Kill every process whose command line matches "python", then pause. | ||
| # Called before each torchrun launch so leftovers cannot interfere with the next run. | ||
| # Globals:   none | ||
| # Arguments: none | ||
| # Returns:   0 in normal operation; the cleanup itself is best-effort. | ||
| ####################################### | ||
| clear_previous_runs() { | ||
| local pid | ||
| # pgrep replaces the `ps | grep '[p]ython'` idiom; `|| true` covers "no match" under `set -e`. | ||
| for pid in $(pgrep -f python || true) | ||
| do | ||
| # Never signal the shell running this script, even if its own command line contains "python". | ||
| if [[ "${pid}" != "$$" ]]; then | ||
| # SIGKILL is kept from the original; a process that already exited is not an error. | ||
| kill -9 "${pid}" 2>/dev/null || true | ||
| fi | ||
| done | ||
| # Fixed 10-second pause kept from the original (presumably lets killed workers release resources). | ||
| sleep 10 | ||
| } | ||
|
|
||
| # Root directory for every artifact produced by this script. | ||
| result_dir="./numerical_test_results/amd_mi300x" | ||
| # Start from a clean slate. Quoted, and `:?` aborts rather than expanding to an empty path. | ||
| rm -rf -- "${result_dir:?}" | ||
|
|
||
| ####################################### | ||
| # Run one module's numerical test repeatedly, then summarise the collected result files. | ||
| # Globals: | ||
| #   result_dir     (read) root directory for all outputs | ||
| #   TORCHRUN_ARGS  (read) launcher arguments handed to torchrun | ||
| #   NUM_RUNS       (read, optional) number of repetitions; defaults to 20 | ||
| # Arguments: | ||
| #   $1 - module name; selects tests/numerical_tests/modules/test_NAME.py | ||
| # Outputs: | ||
| #   RESULT_DIR/NAME/module_mean_and_std/FILE.mean.pt and FILE.std.pt | ||
| #   RESULT_DIR/NAME/module_similarity/FILE.X-Y.json for every pair of runs X < Y | ||
| # The raw per-run directory RESULT_DIR/NAME/module_test is deleted at the end. | ||
| ####################################### | ||
| run_numerical_tests() { | ||
| # Refuse to run without a module name so no path below collapses to "${result_dir}//...". | ||
| local module="${1:?run_numerical_tests: module name required}" | ||
| # Single source of truth for the repetition count; override with NUM_RUNS=N. | ||
| local num_runs="${NUM_RUNS:-20}" | ||
| local module_dir="${result_dir}/${module}" | ||
| local input_list="${module_dir}/module_mean_and_std/input_list.txt" | ||
| local x y name | ||
| local -a file_names=() | ||
| # Get raw module test results | ||
| for ((x = 0; x < num_runs; x++)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for all 19, should we to use parameter to replace. |
||
| do | ||
| mkdir -p "${module_dir}/module_test/${x}" | ||
| clear_previous_runs | ||
| torchrun \ | ||
| "${TORCHRUN_ARGS[@]}" \ | ||
| -m pytest -vxs \ | ||
| "tests/numerical_tests/modules/test_${module}.py" \ | ||
| --result-dir "${module_dir}/module_test/${x}" | ||
| done | ||
| # Calculate module mean and std | ||
| # Distinct result file names across all runs, held in an array so names are never re-split. | ||
| mapfile -t file_names < <(find "${module_dir}/module_test" -type f -printf "%f\n" | sort -u) | ||
| mkdir -p "${module_dir}/module_mean_and_std" | ||
| for name in "${file_names[@]}" | ||
| do | ||
| # One input path per run; `>` truncates, so a stale list can never leak in. | ||
| for ((x = 0; x < num_runs; x++)) | ||
| do | ||
| printf '%s\n' "${module_dir}/module_test/${x}/${name}" | ||
| done > "${input_list}" | ||
| python \ | ||
| tests/numerical_tests/utils/module_mean_and_std.py \ | ||
| --input-list "${input_list}" \ | ||
| --output-mean-file "${module_dir}/module_mean_and_std/${name}.mean.pt" \ | ||
| --output-std-file "${module_dir}/module_mean_and_std/${name}.std.pt" | ||
| rm -- "${input_list}" | ||
| done | ||
|
Comment on lines
+57
to
+72
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not do the loop in a Python function directly? and it can avoid duplicate code in amd/nvidia sh |
||
| # Calculate intra-module similarity | ||
| mkdir -p "${module_dir}/module_similarity" | ||
| for name in "${file_names[@]}" | ||
| do | ||
| for ((x = 0; x < num_runs; x++)) | ||
| do | ||
| # Start y just above x so each unordered pair of runs is compared exactly once. | ||
| for ((y = x + 1; y < num_runs; y++)) | ||
|
Comment on lines
+73
to
+79
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
| do | ||
| python \ | ||
| tests/numerical_tests/utils/module_similarity.py \ | ||
| --stats-a "${module_dir}/module_test/${x}/${name}" \ | ||
| --stats-b "${module_dir}/module_test/${y}/${name}" \ | ||
| --output-file "${module_dir}/module_similarity/${name}.${x}-${y}.json" | ||
| done | ||
| done | ||
| done | ||
| # Remove raw module test results | ||
| rm -rf -- "${module_dir}/module_test" | ||
| } | ||
|
|
||
| # Modules tested with the single-process launcher and the NVTE_* settings exported above. | ||
| for module_name in attention bda embedding layer_norm logits mlp rope | ||
| do | ||
| run_numerical_tests "${module_name}" | ||
| done | ||
|
Comment on lines
+95
to
+101
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add the script first and move these lines to corresponding pr? |
||
|
|
||
| # Drop every NVTE_* override before the loss test. | ||
| unset NVTE_FLASH_ATTN NVTE_CK_USES_BWD_V3 NVTE_FUSED_ATTN \ | ||
| NVTE_FUSED_ATTN_CK NVTE_FUSED_ATTN_AOTRITON NVTE_UNFUSED_ATTN | ||
|
|
||
| run_numerical_tests loss | ||
|
|
||
| # Switch the launcher to 8 processes for the moe_layer test. | ||
| TORCHRUN_ARGS=(--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 50326) | ||
|
|
||
| run_numerical_tests moe_layer | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| # Setup for the module-level numerical test run whose results are written under | ||
| # ./numerical_test_results/nvidia_h200: dependencies, environment, launcher arguments. | ||
| # Abort the script as soon as any command exits with a non-zero status. | ||
| set -e | ||
|
|
||
| # Install the Python packages listed in requirements_ci.txt, plus grouped_gemm at tag v1.1.4. | ||
| pip install -r requirements_ci.txt | ||
| pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 | ||
|
|
||
| # CUDA envs | ||
| export CUDA_DEVICE_MAX_CONNECTIONS=1 | ||
|
|
||
| # NCCL envs | ||
| export CUDA_DEVICE_ORDER=PCI_BUS_ID | ||
| export NCCL_DEBUG=WARN | ||
| export NCCL_IB_PCI_RELAXED_ORDERING=1 | ||
| export NCCL_NET_GDR_LEVEL=5 | ||
| export NCCL_SOCKET_IFNAME=eth0 | ||
| export NCCL_TOPO_FILE=/opt/microsoft/ndv5-topo.xml | ||
|
|
||
| # Megatron-LM envs | ||
| # CRITICAL: 50, ERROR: 40, WARNING: 30, INFO: 20, DEBUG: 10, NOTSET: 0 | ||
| export MEGATRON_LOGGING_LEVEL=20 | ||
|
|
||
| # Arguments passed to torchrun: one process on one node. | ||
| # The array is redefined with --nproc_per_node 8 before the moe_layer test. | ||
| TORCHRUN_ARGS=( | ||
| --nproc_per_node 1 | ||
| --nnodes 1 | ||
| --node_rank 0 | ||
| --master_addr localhost | ||
| --master_port 50326 | ||
| ) | ||
|
|
||
| ####################################### | ||
| # Kill every process whose command line matches "python", then pause. | ||
| # Called before each torchrun launch so leftovers cannot interfere with the next run. | ||
| # Globals:   none | ||
| # Arguments: none | ||
| # Returns:   0 in normal operation; the cleanup itself is best-effort. | ||
| ####################################### | ||
| clear_previous_runs() { | ||
|
yzygitzh marked this conversation as resolved.
|
||
| local pid | ||
| # pgrep replaces the `ps | grep '[p]ython'` idiom; `|| true` covers "no match" under `set -e`. | ||
| for pid in $(pgrep -f python || true) | ||
| do | ||
| # Never signal the shell running this script, even if its own command line contains "python". | ||
| if [[ "${pid}" != "$$" ]]; then | ||
| # SIGKILL is kept from the original; a process that already exited is not an error. | ||
| kill -9 "${pid}" 2>/dev/null || true | ||
| fi | ||
| done | ||
| # Fixed 10-second pause kept from the original (presumably lets killed workers release resources). | ||
| sleep 10 | ||
| } | ||
|
|
||
| # Root directory for every artifact produced by this script. | ||
| result_dir="./numerical_test_results/nvidia_h200" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will there be any issues if two runs use the same dir, maybe add a commit hash in the path? |
||
| # Start from a clean slate. Quoted, and `:?` aborts rather than expanding to an empty path. | ||
| rm -rf -- "${result_dir:?}" | ||
|
|
||
| ####################################### | ||
| # Run one module's numerical test repeatedly, then summarise the collected result files. | ||
| # Globals: | ||
| #   result_dir     (read) root directory for all outputs | ||
| #   TORCHRUN_ARGS  (read) launcher arguments handed to torchrun | ||
| #   NUM_RUNS       (read, optional) number of repetitions; defaults to 20 | ||
| # Arguments: | ||
| #   $1 - module name; selects tests/numerical_tests/modules/test_NAME.py | ||
| # Outputs: | ||
| #   RESULT_DIR/NAME/module_mean_and_std/FILE.mean.pt and FILE.std.pt | ||
| #   RESULT_DIR/NAME/module_similarity/FILE.X-Y.json for every pair of runs X < Y | ||
| # The raw per-run directory RESULT_DIR/NAME/module_test is deleted at the end. | ||
| ####################################### | ||
| run_numerical_tests() { | ||
| # Refuse to run without a module name so no path below collapses to "${result_dir}//...". | ||
| local module="${1:?run_numerical_tests: module name required}" | ||
| # Single source of truth for the repetition count; override with NUM_RUNS=N. | ||
| local num_runs="${NUM_RUNS:-20}" | ||
| local module_dir="${result_dir}/${module}" | ||
| local input_list="${module_dir}/module_mean_and_std/input_list.txt" | ||
| local x y name | ||
| local -a file_names=() | ||
| # Get raw module test results | ||
| for ((x = 0; x < num_runs; x++)) | ||
| do | ||
| mkdir -p "${module_dir}/module_test/${x}" | ||
| clear_previous_runs | ||
| torchrun \ | ||
| "${TORCHRUN_ARGS[@]}" \ | ||
| -m pytest -vxs \ | ||
| "tests/numerical_tests/modules/test_${module}.py" \ | ||
| --result-dir "${module_dir}/module_test/${x}" | ||
| done | ||
| # Calculate module mean and std | ||
| # Distinct result file names across all runs, held in an array so names are never re-split. | ||
| mapfile -t file_names < <(find "${module_dir}/module_test" -type f -printf "%f\n" | sort -u) | ||
| mkdir -p "${module_dir}/module_mean_and_std" | ||
| for name in "${file_names[@]}" | ||
| do | ||
| # One input path per run; `>` truncates, so a stale list can never leak in. | ||
| for ((x = 0; x < num_runs; x++)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add configuration for running times, e.g.19 |
||
| do | ||
| printf '%s\n' "${module_dir}/module_test/${x}/${name}" | ||
| done > "${input_list}" | ||
| python \ | ||
| tests/numerical_tests/utils/module_mean_and_std.py \ | ||
| --input-list "${input_list}" \ | ||
| --output-mean-file "${module_dir}/module_mean_and_std/${name}.mean.pt" \ | ||
| --output-std-file "${module_dir}/module_mean_and_std/${name}.std.pt" | ||
| rm -- "${input_list}" | ||
| done | ||
| # Calculate intra-module similarity | ||
| mkdir -p "${module_dir}/module_similarity" | ||
| for name in "${file_names[@]}" | ||
| do | ||
| for ((x = 0; x < num_runs; x++)) | ||
| do | ||
| # Start y just above x so each unordered pair of runs is compared exactly once. | ||
| for ((y = x + 1; y < num_runs; y++)) | ||
| do | ||
| python \ | ||
| tests/numerical_tests/utils/module_similarity.py \ | ||
| --stats-a "${module_dir}/module_test/${x}/${name}" \ | ||
| --stats-b "${module_dir}/module_test/${y}/${name}" \ | ||
| --output-file "${module_dir}/module_similarity/${name}.${x}-${y}.json" | ||
| done | ||
| done | ||
| done | ||
| # Remove raw module test results | ||
| rm -rf -- "${module_dir}/module_test" | ||
| } | ||
|
|
||
| # Modules tested with the single-process launcher, in this order. | ||
| for module_name in attention bda embedding layer_norm logits loss mlp rope | ||
| do | ||
| run_numerical_tests "${module_name}" | ||
| done | ||
|
|
||
| # Switch the launcher to 8 processes for the moe_layer test. | ||
| TORCHRUN_ARGS=(--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 50326) | ||
|
|
||
| run_numerical_tests moe_layer | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| # Abort the script as soon as any command exits with a non-zero status. | ||
| set -e | ||
|
|
||
| # Inputs: the two per-platform result trees; each is read at MODULE/module_mean_and_std. | ||
| stats_dir_a="./numerical_test_results/nvidia_h200" | ||
| stats_dir_b="./numerical_test_results/amd_mi300x" | ||
| # Output: root directory for the cross-platform similarity reports. | ||
| result_dir="./numerical_test_results/nvidia_h200_vs_amd_mi300x" | ||
|
|
||
| ####################################### | ||
| # Compare one module's mean/std files between the two result trees. | ||
| # Globals: | ||
| #   stats_dir_a (read) tree whose file names drive the comparison | ||
| #   stats_dir_b (read) tree expected to contain the same file names | ||
| #   result_dir  (read) root directory for the JSON reports | ||
| # Arguments: | ||
| #   $1 - module name | ||
| # Outputs: | ||
| #   RESULT_DIR/NAME/module_similarity/FILE.json for each file found in stats_dir_a | ||
| # Returns: | ||
| #   non-zero if the stats_dir_a directory is absent or a file has no counterpart in stats_dir_b | ||
| ####################################### | ||
| module_similarity() { | ||
|
yzygitzh marked this conversation as resolved.
|
||
| local module="${1:?module_similarity: module name required}" | ||
| local dir_a="${stats_dir_a}/${module}/module_mean_and_std" | ||
| local dir_b="${stats_dir_b}/${module}/module_mean_and_std" | ||
| local out_dir="${result_dir}/${module}/module_similarity" | ||
| local name | ||
| local -a file_names=() | ||
| # Without this check a missing directory produced an empty list and a silent "success". | ||
| if [[ ! -d "${dir_a}" ]]; then | ||
| printf 'module_similarity: directory not found: %s\n' "${dir_a}" >&2 | ||
| return 1 | ||
| fi | ||
| # File names go into an array so they are never re-split on whitespace. | ||
| mapfile -t file_names < <(find "${dir_a}" -type f -printf "%f\n" | sort -u) | ||
| mkdir -p "${out_dir}" | ||
| for name in "${file_names[@]}" | ||
| do | ||
| # Report a mismatch between the two trees explicitly instead of leaving it to the Python script. | ||
| if [[ ! -f "${dir_b}/${name}" ]]; then | ||
| printf 'module_similarity: %s is missing from %s\n' "${name}" "${dir_b}" >&2 | ||
| return 1 | ||
| fi | ||
| python \ | ||
| tests/numerical_tests/utils/module_similarity.py \ | ||
| --stats-a "${dir_a}/${name}" \ | ||
| --stats-b "${dir_b}/${name}" \ | ||
| --output-file "${out_dir}/${name}.json" | ||
|
Comment on lines
+12
to
+16
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what happens if there's mismatch? seems there's no assert in the code |
||
| done | ||
| } | ||
|
|
||
| # Compare every module, in this order. | ||
| for module_name in attention bda embedding layer_norm logits loss mlp moe_layer rope | ||
| do | ||
| module_similarity "${module_name}" | ||
| done | ||
Uh oh!
There was an error while loading. Please reload this page.