From 865487749b295128f5e005e89d6998618d6a2604 Mon Sep 17 00:00:00 2001
From: Tridu33 <tridu33@qq.com>
Date: Sat, 16 Nov 2024 16:51:57 +0800
Subject: [PATCH 1/3] bugfix: fix issue
 https://gitee.com/mindspore/mindspore/issues/IB455C?from=project-issue,
 where --worker_num and --local_worker_num must be the same number, e.g.
 both 8 or both 4.
---
 docs/sample_code/startup_method/msrun_1.sh      | 2 +-
 docs/sample_code/startup_method/msrun_2.sh      | 2 +-
 docs/sample_code/startup_method/msrun_single.sh | 2 +-
 docs/sample_code/startup_method/run_mpirun_1.sh | 2 +-
 docs/sample_code/startup_method/run_mpirun_2.sh | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/sample_code/startup_method/msrun_1.sh b/docs/sample_code/startup_method/msrun_1.sh
index 76d4602c33..9c9fd2334e 100644
--- a/docs/sample_code/startup_method/msrun_1.sh
+++ b/docs/sample_code/startup_method/msrun_1.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=4 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=8 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/msrun_2.sh b/docs/sample_code/startup_method/msrun_2.sh
index 210da5f81d..19e3bdeeba 100644
--- a/docs/sample_code/startup_method/msrun_2.sh
+++ b/docs/sample_code/startup_method/msrun_2.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=4 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=8 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/msrun_single.sh b/docs/sample_code/startup_method/msrun_single.sh
index 6d0cb6007f..7bb7f0ff5c 100644
--- a/docs/sample_code/startup_method/msrun_single.sh
+++ b/docs/sample_code/startup_method/msrun_single.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=4 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=8 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/run_mpirun_1.sh b/docs/sample_code/startup_method/run_mpirun_1.sh
index db3689ab9f..6016cedddd 100644
--- a/docs/sample_code/startup_method/run_mpirun_1.sh
+++ b/docs/sample_code/startup_method/run_mpirun_1.sh
@@ -14,5 +14,5 @@ if [ ! -d "${EXEC_PATH}/MNIST_Data" ]; then
 fi
 export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/
 
-mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename log_output \
+mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename ./mpirun_log/log_output \
     --merge-stderr-to-stdout python net.py
diff --git a/docs/sample_code/startup_method/run_mpirun_2.sh b/docs/sample_code/startup_method/run_mpirun_2.sh
index e5ca9cdeb5..101c28615e 100644
--- a/docs/sample_code/startup_method/run_mpirun_2.sh
+++ b/docs/sample_code/startup_method/run_mpirun_2.sh
@@ -15,5 +15,5 @@ fi
 export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/
 HOSTFILE=$1
 
-mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output \
+mpirun -n 16 --hostfile $HOSTFILE --output-filename ./mpirun_log/log_output \
     --merge-stderr-to-stdout python net.py
-- 
Gitee

From 549b5fa4b5eefb67c22965e3e2e7649cc0c4e244 Mon Sep 17 00:00:00 2001
From: Tridu33 <tridu33@qq.com>
Date: Thu, 9 Jan 2025 09:57:38 +0800
Subject: [PATCH 2/3] refactor: add a description of the address-mapping
 coordinate filling rule for advanced operator parallelism
---
 .../model_train/parallel/advanced_operator_parallel.md | 35 +++++++++++++++++
 .../model_train/parallel/advanced_operator_parallel.md | 37 +++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/docs/mindspore/source_en/model_train/parallel/advanced_operator_parallel.md b/docs/mindspore/source_en/model_train/parallel/advanced_operator_parallel.md
index 02dbe7712c..9ca3b5c74e 100644
--- a/docs/mindspore/source_en/model_train/parallel/advanced_operator_parallel.md
+++ b/docs/mindspore/source_en/model_train/parallel/advanced_operator_parallel.md
@@ -44,6 +44,41 @@ a_strategy = layout("mp", ("sp", "dp"))
 
 Notice that the "[a0, a1, a2, a3]" of the tensor a is sliced twice to the "sp" and "mp" axes of the device, so that the result comes out as:
 
 
 
+The coordinate mapping rule in the figure above is `a[x][y]=>Rank[mp]([sp][dp])`: the y coordinate of $a$ (y = 0, 1, 2, 3) fills the dp dimension first and carries over to the next dimension (the sp dimension) once each dimension's radix is exhausted. The correspondence is listed in the following table:
+
+| Arr | x->mp | y->(sp,dp) | => | dp | sp | mp | Rank |
+| --- | ----- | ---------- | --- | --- | --- | --- | ---- |
+| a0 | 0 | 0 | | 0 | 0 | 0 | r0 |
+| a1 | 0 | 1 | | 1 | 0 | 0 | r4 |
+| a2 | 0 | 2 | | 0 | 1 | 0 | r2 |
+| a3 | 0 | 3 | | 1 | 1 | 0 | r6 |
+| a4 | 1 | 0 | | 0 | 0 | 1 | r1 |
+| a5 | 1 | 1 | | 1 | 0 | 1 | r5 |
+| a6 | 1 | 2 | | 0 | 1 | 1 | r3 |
+| a7 | 1 | 3 | | 1 | 1 | 1 | r7 |
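+
+To make the filling rule concrete, here is a minimal standalone sketch with a
+hypothetical helper `rank_of` (plain Python, not a MindSpore API; the digit
+order of the `2 * 2 * 2` device matrix is assumed from the layout above):
+
+```python
+# Assumed device matrix (dp, sp, mp) = (2, 2, 2): dp is the highest-order
+# digit of the rank id and mp the lowest.
+DP, SP, MP = 2, 2, 2
+
+def rank_of(x, y):
+    """Map slice a[x][y] to its rank under a_strategy = layout("mp", ("sp", "dp"))."""
+    mp = x                # the first tensor axis is sliced along mp
+    dp = y % DP           # y fills dp first (lowest digit of the combined axis)
+    sp = (y // DP) % SP   # and carries into sp once dp wraps around
+    return dp * (SP * MP) + sp * MP + mp  # rank id in the (dp, sp, mp) device matrix
+
+for i in range(8):
+    x, y = divmod(i, DP * SP)
+    print(f"a{i} -> r{rank_of(x, y)}")
+# Prints a0 -> r0, a1 -> r4, a2 -> r2, a3 -> r6, a4 -> r1, a5 -> r5,
+# a6 -> r3, a7 -> r7, matching the table above.
+```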
 The following is exemplified by a concrete example in which the user computes a two-dimensional matrix multiplication over 8 cards: `Y = (X * W)`, where the devices are organized according to `2 * 2 * 2`, and the cut of X coincides with the cut of the tensor a described above:
diff --git a/docs/mindspore/source_zh_cn/model_train/parallel/advanced_operator_parallel.md b/docs/mindspore/source_zh_cn/model_train/parallel/advanced_operator_parallel.md
index daab4a65b3..5bcf286d55 100644
--- a/docs/mindspore/source_zh_cn/model_train/parallel/advanced_operator_parallel.md
+++ b/docs/mindspore/source_zh_cn/model_train/parallel/advanced_operator_parallel.md
@@ -43,6 +43,43 @@ a_strategy = layout("mp", ("sp", "dp"))
 
 
 
+The coordinate mapping rule in the figure above is `a[x][y]=>Rank[mp]([sp][dp])`: the y coordinate of $a$ (y = 0, 1, 2, 3) fills the dp dimension first,
+carrying over to the next dimension (the sp dimension) once each dimension's radix is exhausted. The correspondence is listed in the following table:
+
+| Arr | x->mp | y->(sp,dp) | => | dp | sp | mp | Rank |
+| --- | ----- | ---------- | --- | --- | --- | --- | ---- |
+| a0 | 0 | 0 | | 0 | 0 | 0 | r0 |
+| a1 | 0 | 1 | | 1 | 0 | 0 | r4 |
+| a2 | 0 | 2 | | 0 | 1 | 0 | r2 |
+| a3 | 0 | 3 | | 1 | 1 | 0 | r6 |
+| a4 | 1 | 0 | | 0 | 0 | 1 | r1 |
+| a5 | 1 | 1 | | 1 | 0 | 1 | r5 |
+| a6 | 1 | 2 | | 0 | 1 | 1 | r3 |
+| a7 | 1 | 3 | | 1 | 1 | 1 | r7 |
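+
+To make the filling rule concrete, here is a minimal standalone sketch with a
+hypothetical helper `rank_of` (plain Python, not a MindSpore API; the digit
+order of the `2 * 2 * 2` device matrix is assumed from the layout above):
+
+```python
+# Assumed device matrix (dp, sp, mp) = (2, 2, 2): dp is the highest-order
+# digit of the rank id and mp the lowest.
+DP, SP, MP = 2, 2, 2
+
+def rank_of(x, y):
+    """Map slice a[x][y] to its rank under a_strategy = layout("mp", ("sp", "dp"))."""
+    mp = x                # the first tensor axis is sliced along mp
+    dp = y % DP           # y fills dp first (lowest digit of the combined axis)
+    sp = (y // DP) % SP   # and carries into sp once dp wraps around
+    return dp * (SP * MP) + sp * MP + mp  # rank id in the (dp, sp, mp) device matrix
+
+for i in range(8):
+    x, y = divmod(i, DP * SP)
+    print(f"a{i} -> r{rank_of(x, y)}")
+# Prints a0 -> r0, a1 -> r4, a2 -> r2, a3 -> r6, a4 -> r1, a5 -> r5,
+# a6 -> r3, a7 -> r7, matching the table above.
+```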
+
 The following walks through a concrete example: the user computes a two-dimensional matrix multiplication on 8 cards, `Y = (X * W)`, where the devices are organized as `2 * 2 * 2` and the slicing of X is identical to the slicing of the tensor a described above:
 
 ```python
-- 
Gitee

From 116e33f0a0062e6cfdb56ed80ea754c419e42920 Mon Sep 17 00:00:00 2001
From: Tridu33 <tridu33@qq.com>
Date: Thu, 9 Jan 2025 10:03:09 +0800
Subject: [PATCH 3/3] Revert "bugfix: fix issue
 https://gitee.com/mindspore/mindspore/issues/IB455C?from=project-issue,
 where --worker_num and --local_worker_num must be the same number, e.g.
 both 8 or both 4."

This reverts commit 865487749b295128f5e005e89d6998618d6a2604.

	modified:   docs/sample_code/startup_method/msrun_1.sh
	modified:   docs/sample_code/startup_method/msrun_2.sh
	modified:   docs/sample_code/startup_method/msrun_single.sh
	modified:   docs/sample_code/startup_method/run_mpirun_1.sh
	modified:   docs/sample_code/startup_method/run_mpirun_2.sh
---
 docs/sample_code/startup_method/msrun_1.sh      | 2 +-
 docs/sample_code/startup_method/msrun_2.sh      | 2 +-
 docs/sample_code/startup_method/msrun_single.sh | 2 +-
 docs/sample_code/startup_method/run_mpirun_1.sh | 2 +-
 docs/sample_code/startup_method/run_mpirun_2.sh | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/sample_code/startup_method/msrun_1.sh b/docs/sample_code/startup_method/msrun_1.sh
index 9c9fd2334e..76d4602c33 100644
--- a/docs/sample_code/startup_method/msrun_1.sh
+++ b/docs/sample_code/startup_method/msrun_1.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=8 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=4 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/msrun_2.sh b/docs/sample_code/startup_method/msrun_2.sh
index 19e3bdeeba..210da5f81d 100644
--- a/docs/sample_code/startup_method/msrun_2.sh
+++ b/docs/sample_code/startup_method/msrun_2.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=8 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=4 --master_addr=<node_1 ip address> --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/msrun_single.sh b/docs/sample_code/startup_method/msrun_single.sh
index d7941671d6..619feafef9 100644
--- a/docs/sample_code/startup_method/msrun_single.sh
+++ b/docs/sample_code/startup_method/msrun_single.sh
@@ -18,4 +18,4 @@ rm -rf msrun_log
 mkdir msrun_log
 echo "start training"
 
-msrun --worker_num=8 --local_worker_num=8 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
+msrun --worker_num=8 --local_worker_num=4 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py
diff --git a/docs/sample_code/startup_method/run_mpirun_1.sh b/docs/sample_code/startup_method/run_mpirun_1.sh
index 6016cedddd..db3689ab9f 100644
--- a/docs/sample_code/startup_method/run_mpirun_1.sh
+++ b/docs/sample_code/startup_method/run_mpirun_1.sh
@@ -14,5 +14,5 @@ if [ ! -d "${EXEC_PATH}/MNIST_Data" ]; then
 fi
 export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/
 
-mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename ./mpirun_log/log_output \
+mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename log_output \
    --merge-stderr-to-stdout python net.py
diff --git a/docs/sample_code/startup_method/run_mpirun_2.sh b/docs/sample_code/startup_method/run_mpirun_2.sh
index 101c28615e..e5ca9cdeb5 100644
--- a/docs/sample_code/startup_method/run_mpirun_2.sh
+++ b/docs/sample_code/startup_method/run_mpirun_2.sh
@@ -15,5 +15,5 @@ fi
 export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/
 HOSTFILE=$1
 
-mpirun -n 16 --hostfile $HOSTFILE --output-filename ./mpirun_log/log_output \
+mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output \
    --merge-stderr-to-stdout python net.py
-- 
Gitee