Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
risemeup1 committed Apr 16, 2024
1 parent ee88c12 commit 667a91f
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 35 deletions.
2 changes: 2 additions & 0 deletions llm/llama/auto_parallel/run_pretrain_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
print_rank_0,
)

print("---")


def add_start_docstrings(*docstr):
def docstring_decorator(fn):
Expand Down
2 changes: 1 addition & 1 deletion model_zoo/gpt-3/tasks/gpt/train_pir.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, "../../")))

print('---')
from ppfleetx.data import build_dataloader
from ppfleetx.distributed.apis import env
from ppfleetx.models import build_module
Expand Down
68 changes: 34 additions & 34 deletions scripts/distribute/ci_case_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ function gpt_auto_recompute_bs16_fp32_DP1-MP1-PP1() {
loss_base=10.507633305
ips_base=3518
mem_base=11750.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -144,7 +144,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8() {
loss_base=10.570028400
ips_base=35050
mem_base=1988.9
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -181,7 +181,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8_pir() {
loss_base=10.570028400
ips_base=35050
mem_base=1988.9
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -217,7 +217,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP2-PP4() {
loss_base=10.700293922
ips_base=32518
mem_base=1535.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -253,7 +253,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2() {
loss_base=10.672543240
ips_base=18681
mem_base=2135.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -290,7 +290,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_pir() {
loss_base=10.672543240
ips_base=18681
mem_base=2135.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -326,7 +326,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1() {
loss_base=10.720068359
ips_base=15232
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -363,7 +363,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1_pir() {
loss_base=10.720068359
ips_base=15232
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -399,7 +399,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage2() {
loss_base=10.720078850
ips_base=15571
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -435,7 +435,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage3() {
loss_base=10.681921577
ips_base=13813
mem_base=1747.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -471,7 +471,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1() {
loss_base=10.579057693
ips_base=19822
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -508,7 +508,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1_pir() {
loss_base=10.579057693
ips_base=19822
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -544,7 +544,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage2() {
loss_base=10.579057693
ips_base=20170
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -580,7 +580,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage3() {
loss_base=10.585316849
ips_base=15742
mem_base=1591.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -616,7 +616,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage1() {
loss_base=10.672568035
ips_base=19461
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -652,7 +652,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2() {
loss_base=10.672568035
ips_base=19652
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -689,7 +689,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2_pir() {
loss_base=10.672568035
ips_base=19652
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -725,7 +725,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3() {
loss_base=10.696336079
ips_base=16613
mem_base=1280.5
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -762,7 +762,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3_pir() {
loss_base=10.696336079
ips_base=16613
mem_base=1280.5
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -844,7 +844,7 @@ function gpt_auto_sp_acc_check() {
ips_base=-1
mem_base=-1
echo "result: loss_spTrue=$loss loss_spFasle=$loss_base"
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -910,7 +910,7 @@ function llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1() {
loss_base=9.52110565
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -976,7 +976,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1() {
loss_base=9.42011833
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1042,7 +1042,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1() {
loss_base=9.44299471
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1108,7 +1108,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2() {
loss_base=9.45936012
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1176,7 +1176,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2
loss_base=9.46707726
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1245,7 +1245,7 @@ function llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
loss_base=10.0859375
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1312,7 +1312,7 @@ function llama_dygraph_auto_bs8_fp32_DP2() {
loss_base=9.53389835
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1379,7 +1379,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2() {
loss_base=9.39066124
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1446,7 +1446,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
loss_base=9.38235474
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1514,7 +1514,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
loss_base=9.38256836
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1582,7 +1582,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
loss_md5_base=0ebf68698887b33b33a46518621cf412
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1652,7 +1652,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
loss_md5_base=6df87d01bd08113a92930f6349514b35
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1722,7 +1722,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
loss_md5_base=6cb4e151b35f026190df90ab240d9a95
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1792,7 +1792,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
loss_md5_base=e82a1f5668870d18a2d45b3ee0a25386
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down

0 comments on commit 667a91f

Please sign in to comment.