Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

modify run_ci.sh #8078

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llm/llama/auto_parallel/run_pretrain_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
print_rank_0,
)

print("---")


def add_start_docstrings(*docstr):
def docstring_decorator(fn):
Expand Down
2 changes: 1 addition & 1 deletion model_zoo/gpt-3/tasks/gpt/train_pir.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, "../../")))

print('---')
from ppfleetx.data import build_dataloader
from ppfleetx.distributed.apis import env
from ppfleetx.models import build_module
Expand Down
68 changes: 34 additions & 34 deletions scripts/distribute/ci_case_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ function gpt_auto_recompute_bs16_fp32_DP1-MP1-PP1() {
loss_base=10.507633305
ips_base=3518
mem_base=11750.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -144,7 +144,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8() {
loss_base=10.570028400
ips_base=35050
mem_base=1988.9
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -181,7 +181,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8_pir() {
loss_base=10.570028400
ips_base=35050
mem_base=1988.9
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -217,7 +217,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP1-MP2-PP4() {
loss_base=10.700293922
ips_base=32518
mem_base=1535.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -253,7 +253,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2() {
loss_base=10.672543240
ips_base=18681
mem_base=2135.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -290,7 +290,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_pir() {
loss_base=10.672543240
ips_base=18681
mem_base=2135.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -326,7 +326,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1() {
loss_base=10.720068359
ips_base=15232
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -363,7 +363,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1_pir() {
loss_base=10.720068359
ips_base=15232
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -399,7 +399,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage2() {
loss_base=10.720078850
ips_base=15571
mem_base=1999.2
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -435,7 +435,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage3() {
loss_base=10.681921577
ips_base=13813
mem_base=1747.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -471,7 +471,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1() {
loss_base=10.579057693
ips_base=19822
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -508,7 +508,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1_pir() {
loss_base=10.579057693
ips_base=19822
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -544,7 +544,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage2() {
loss_base=10.579057693
ips_base=20170
mem_base=1709.8
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -580,7 +580,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage3() {
loss_base=10.585316849
ips_base=15742
mem_base=1591.6
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -616,7 +616,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage1() {
loss_base=10.672568035
ips_base=19461
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -652,7 +652,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2() {
loss_base=10.672568035
ips_base=19652
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -689,7 +689,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2_pir() {
loss_base=10.672568035
ips_base=19652
mem_base=1384.7
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -725,7 +725,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3() {
loss_base=10.696336079
ips_base=16613
mem_base=1280.5
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -762,7 +762,7 @@ function gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3_pir() {
loss_base=10.696336079
ips_base=16613
mem_base=1280.5
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -844,7 +844,7 @@ function gpt_auto_sp_acc_check() {
ips_base=-1
mem_base=-1
echo "result: loss_spTrue=$loss loss_spFasle=$loss_base"
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -910,7 +910,7 @@ function llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1() {
loss_base=9.52110565
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -976,7 +976,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1() {
loss_base=9.42011833
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1042,7 +1042,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1() {
loss_base=9.44299471
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1108,7 +1108,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2() {
loss_base=9.45936012
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1176,7 +1176,7 @@ function llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2
loss_base=9.46707726
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1245,7 +1245,7 @@ function llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
loss_base=10.0859375
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1312,7 +1312,7 @@ function llama_dygraph_auto_bs8_fp32_DP2() {
loss_base=9.53389835
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1379,7 +1379,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2() {
loss_base=9.39066124
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1446,7 +1446,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
loss_base=9.38235474
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1514,7 +1514,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
loss_base=9.38256836
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1582,7 +1582,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
loss_md5_base=0ebf68698887b33b33a46518621cf412
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1652,7 +1652,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
loss_md5_base=6df87d01bd08113a92930f6349514b35
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1722,7 +1722,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
loss_md5_base=6cb4e151b35f026190df90ab240d9a95
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down Expand Up @@ -1792,7 +1792,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
loss_md5_base=e82a1f5668870d18a2d45b3ee0a25386
ips_base=-1
mem_base=-1
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
#check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

Expand Down
Loading