From 2cac6f8a104653c47164402bc42c83d5607b5c18 Mon Sep 17 00:00:00 2001
From: Xinyu Yang
Date: Mon, 15 Jan 2024 14:59:10 +0800
Subject: [PATCH] Auto mixed precision no log (#4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [DimExpr] DimExpr support hash (#60471) * open warning with `paddle.utils.deprecated` (#60458) * open_warning * update unittest * update * fix typos * fix warning in test runner * uncomment * cleanup todo * using VisibleDeprecationWarning * update comment * fix typo * fix indentation * fix * fix * fix indent level and test * update --------- Co-authored-by: SigureMo * [AutoParallel] Auto Trans PP to VPP (#60467) * [AutoParallel] Auto Trans PP to VPP * add comment * 【PIR OpTest Fix No.23】 fix test_distribute_fpn_proposals_op (#60335) * fix * fix * fix test_lookup_table_v2_bf16_op (#60332) * Fix shape error in combined-indexing setitem (#60447) * add ut * fix shape error in combine-indexing * fix ut * [auto parallel] Add pp lazy init, bug fix for xavier (#60441) * [PIR] add slice_array_dense api (#60433) * fix * fix * Set value with scalar (#60452) * set_value with scalar * fix ut * [PIR]Support custom op in PIR (#59790) * support custom op in pir * fix compile bugs * fix bugs * delete code * fix windows bugs * fix windows bugs * add symbol to paddle lib * fix windows bugs * revert code * fix bugs * fix bugs * perfect code according comment * fix py3 * revert third party * fix bugs * fix bug * fix compile bugs * fix windows * [Prim][PIR] support roll, gather, scatter, scatter_nd_add op backward in pir prim (#60481) * prim gather op backward * prim scatter op backward * prim roll op backward * prim scatter_nd op backward * [PIR] delete dense_tensor mem_desc_ (#60024) * delete dense_tensor mem_desc_ * [PIR] Complement op defs (#60475) * complement translation of legacy matmul * Complement op mappings in translation for deformable_conv_v1.
* [pir]Supporting constant_folding_pass for train (#60355) * [pir]Supporting constant_folding_pass for train * fix * Update constant_folding_pass.cc * [Dynamic Shape] Fuse shape ops into generate shape op pass (#60490) * add shape.generate_shape op * rename shape.generate_shape to cinn_op.generate_shape * refactor GenerateShapeOp::SymbolBinding * move GenerateShapeOp related helper functions into generate_shape_util.cc * minor fix * minor fix * backup * refine signature of ConvertDimExprToAttribute * minor fix for signature of ConvertDimExprToAttributes * remove SubstituteDimExpr from generate_shape_util.h * Fix compile error * Fix unittest compile error * Code format * Code format * Fix _hiden_size to _hidden_size (#60485) * [DimExpr] Add substitute DimExpr util (#60493) * add SubstituteDimExpr * Fix compile error * Code format * Polish DimExprUtilTest * Change namesapce * Fix unittest * Polish DimExprUtilTest * [xpu]add sine_pos fuse pass and sine_pos xpu kernel (#60025) * add split with variable in factors and rewrite vectorize,unroll,bind error handling mechanism (#60449) * [CodeStyle] Fix regression of Ruff in sot (#60483) * support cast op from FP32 to low precision (#60385) * test=document_fix (#60399) * [XPU] refine flash attention ut (#60474) * [XPU] refine flash attention ut * refine tolerance * [Inference] support collect shape in sub block (#60451) * support collect shape in sub block * udpate * udpate * fix process mesh incorrect set in converter (#60504) * 【CMake opt No.13】Remove CINN DEPS in test/cpp/pir/shape_dialect/CMakeLists.txt (#60517) * Update CMakeLists.txt * Apply suggestions from code review * Apply suggestions from code review * Update CMakeLists.txt * Update CMakeLists.txt * 【pir】 add tensorarray op createarrylike, add_n (#60460) * optimize backward * [PIR] add vjp interface for while op * [PIR] fix ci error. * modify while stopgradient * merge * modify while grad bug * modify while grad op * modify * increment vp * [PIR] add get_used_external_value interface for block. 
* while case * delete print * delete print * Update python/paddle/autograd/ir_backward.py * [PIR] add unit_test for get_used_external_value * modify while_loop * code_style * modofy ci bug * modify while api * modify ci * modify array * Update python/paddle/autograd/ir_backward.py * Update test/legacy_test/test_cond.py * update * modify array_write grad info * merge * add_n and createarraylike * conflict * modify exe bug * modify kernel choose --------- Co-authored-by: winter-wang <1030748926@qq.com> * Add align iter space tactic (#60498) Add align iter space tactic * [Dynamic Shape] Add helper function MakeGenerateShapeOpAttribute (#60512) * add helper function MakeGenerateShapeOpAttribute * fix complier complaint * Code format * [Prim][PIR] Set prim gflag for pure cpp (#60505) * inference support decomp * polish code * add decomp base define * add decomp base define2 * change decomp infer * fix symbol overload * fix test case * debug * debug * decomp add debug info * add cpp flag * revert * remove unused flag * [PIR] Refine and fix pir exe (#60443) * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * update 2023 security advisory, test=document_fix (#60527) * [Inference] refine common/*.h for inference lib (#60513) * 【complex op】No.19 add complex support for triangular_solve (#59529) * fix reshard dist_attr (#60535) * 【auto parallel】剔除切分推导相关的头文件对proto 的依赖 (#60543) * decouple proto * format * format * strcuct pre def * [PIR] Support Operation::Clone Interface (#60536) * [PIR] Support Operation::Clone Interface * modify into shared_ptr * [Dynamic Shape] Add FullyInsertBroadcastPass and Broadcast Op (#60511) * add ShapeBroadcastOp * add pass FullyInsertBroadcastPass * InferSymbolicShape of BroadcastShape Op * Delete unit test * Fix return error * Code format * Fix error message * Update paddle/cinn/hlir/dialect/operator/transforms/fully_insert_broadcast_pass.cc Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> --------- Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> * Fix OpTranslatorTest name (#60518) * fix name * fix name * fix name * fix name * [PIR] migrate DataFeeder into pir (#60434) * 【PIR API adaptor No.90,92】Migrate some ops into pir (#59801) * [DimExpr] Convert Broadcast to BroadcastTree (#60440) * backup BroadcastTree * add SubstituteDimExpr * add helper function ConstructBroadcastTree * Fix compile error * Code format * Polish DimExprUtilTest * Add cmake file * Change namesapce * Fix compile error * Fix unittest * reconstruct BroadcastTree * Polish DimExprUtilTest * Reconstruct BroadcastTree * Finish BroadcastBranch * Finish BroadcastBranch * Finish BroadcastBranch * Add Unittest * Remove unnecessary dim_expr_util * Add header file * [Dynamic Shape] Erase expand (#60525) * EraseExpandOp * minor fix * minor fix * Code format * [inference] Support wint4 groupwise with cutlass gemm (#60422) * support gemv-groupwise func && weightQuanter-groupwise && weightDeQuanter-groupwise * fix build bug * add unit_test && fix bug * delete useless code * fix ci build bug * fix ci && optimize * fix merge conflict * add op change info * fix weight_only_linear_pass * fix format * solve ci unit_test * init * support cutlass gemm with groupwise * add unit test * fix strange bug * delete random bug * fix sm70 build bug * try to fix ci build bug * fix bug * fix volta build bug * skip sm70 in groupwise mode * change cutlass branch * simplify extent of loop after fuse and add corresponding test case (#60538) * fix bug of put_along_axis (#60551) * 
remove clearPass to allow custom device use fusion under fp16 (#60541) * fix fleetutil get_online_pass_interval bug2; test=develop (#60544) * fix vs2017 limit (#60528) * 【Hackathon 5th No.20】Add Exponential and Gamma APIs for Paddle (#57899) * add exponential * add gamma distribution * refine docs * add kl_divergence and test * resolve conflicts * resolve conflicts * fix bug * refine test * fix test timeout * refine code * add standard_gamma kernel * fix comments * fix tests * fix tests * fix comments * fix tests * fix gamma grad * fix yaml * fix bugs * fix tests * fix standard_gamma_grad * fix test * fix test * add cdf & icdf * add cdf & icdf * refine comments * fix * fix * fix head file * fix * fix cuda op * fix * fix * refine test * fix test * refine comments * fix comments * fix * fix * fix type check * fix docs * delete useless comments * [CINN] Add IntrinsicOps into ir_codes_collector (#60556) This PR fixed a bug when running ResNet from PaddleClas: vectorize introduces an intrinsic GetAddr, and because the tensor of GetAddr was not collected in ir_node_collector, the tensor alias was not created in the CUDA code. TODO: we may modify IntrinsicOp in the near future * 【auto parallel】custom op spmd rule register (#60509) * custom op spmd rule register * custom op spmd rule register * custom op spmd rule register * custom op spmd rule register * polish * 【AutoParallel】Add master grad in AMP-O2 of AutoParallel (#59987) * add master_grad in auto-parallel * reset third_party * fix coverage * support bf16 master_grad * fix bug in master_grad * change code according to review * change the way to find optimizer op * [Dy2St] Fix `NameloadJstTransformer` missing transform call kwargs (#60515) --------- Co-authored-by: gouzil <66515297+gouzil@users.noreply.github.com> * cinn(backends): generate infer shape kernel to infer shape of output tensor (#60519) The backend infer-shape results are returned through a two-dimensional pointer. The generated CINN IR looks like the following, where tensor_shape_args is a two-dimensional pointer: infer_shape_set_value(0, 0, S1, tensor_shape_args) means setting dim 0 of output tensor 0 to S1. * fix tensor math method inplace converter (#60546) * [xpu]Add vis_decoder_attention_xpu_pass && modify qkv_attention_xpu_kernel (#60361) * [Prim][PIR] support abs, instance_norm op backward in prim pir (#60444) * abs op backward * add test case * update code * update code * update code * update code * update code * instance_norm op backward * add instance_norm_v2 test cast * custom op * [PIR] remove log simply name mechnism from phi to common. (#60507) * [InferSymbolicShape] Delete redundent value_id_to_shapeordata_ (#60554) * 【Hackathon 5th No.25】add gammaln api (#60553) * fix (#60570) * [CINN] Add tile tactic and bind cuda tactic (#60534) * [CINN] Add tile tactic * [CINN] Add bind cuda tactic * 【PIR OpTest Fix No.8】 fix test_shuffle_batch_op (#59631) * fix test_shuffle_batch_op * fix * 【PIR OpTest Fix No.14】 fix test_nce (#60255) * fix test_nce * fix test_nce * Update ops.yaml * fix * Update utils.cc * Update ops.yaml * 【PIR OpTest Fix No.19】 fix test_ftrl_op (#60329) * fix test_ftrl_op * fix * [auto parallel] Lazy init for MP. Add reshard infer shape.
(#60563) * [PIR] Add unittest for Operation::Clone and Group::Clone (#60577) * [PIR] dce pass disable custom op (#60578) * [Inference] Fix bug of RunWithExternalStream API in new executor (#60122) * fix bug of RunWithExternalStream API in new executor * add test * fix bug of RunWithExternalStream API in new executor * reset flage in RunWithExternalStream * fix bug * add param swith_stream * fix bug * modify python api * fix bug * Resubmit PR-58859 (#60310) * allow multiple rng state in generator * Fix 60142; Fix some comments from sneaxiy * Overwrite copy constructors * add api * pre-commit * tensor_array slice in PIR (#60503) * use slice_array, now will meet error of destory opresult still in use * disable the pir test until the bug fixed * Set DistModel state_dict keys to structure_names (#60478) * exclude xpu * check structure name mapping * test pp * polish * support dynamic save static load * support dygraph save static load * polish * polish * use structured_name as key in DistModel state_dict * polish * polish * fix checkpoint path conflict * test get_rank_to_files * static save dynamic load test * fix sm75 build bug (#60583) * replace LOG(INFO) with VLOG(6) * Add CanProveDivisible for symbolic calculation (#60572) * add CanProveDivisible for symbolic calculation * delete extra cout for debug * fix according to some comments * [PIR][DynamicShape] make shape pass default and fix some bugs (#60548) att, make shape pass default and fix some bugs * Fix words (#60603) * 【auto parallel】custom op use spmd rule (#60571) * custom op use smpd rule * custom op use smpd rule * [auto parallel] add lazy init ut to llama (#60585) * 【pir】 modify array_write and array_read vjp , add a simple while with array_write (#60575) * optimize backward * [PIR] add vjp interface for while op * [PIR] fix ci error. * modify while stopgradient * merge * modify while grad bug * modify while grad op * modify * increment vp * [PIR] add get_used_external_value interface for block. 
* while case * delete print * delete print * Update python/paddle/autograd/ir_backward.py * [PIR] add unit_test for get_used_external_value * modify while_loop * code_style * modofy ci bug * modify while api * modify ci * modify array * Update python/paddle/autograd/ir_backward.py * Update test/legacy_test/test_cond.py * update * modify array_write grad info * merge * add_n and createarraylike * conflict * modify array_write vjp * modify array_write vjp * Update paddle/fluid/pybind/manual_static_op_function.h * modify array_write vjp * modify ci bug * modify * modify * Update test/legacy_test/test_while_loop_op.py * modify inplace array_read * Update test/legacy_test/test_while_op.py * Update test/ir/pir/test_while_api.py --------- Co-authored-by: winter-wang <1030748926@qq.com> * [Prim][PIR] add leaky_relu, sigmoid, instance_norm op forward prim (#60564) * hardswish op prim sink * hardswish op prim * add composite * add leaky_relu, sigmoid op forward prim * remove hardswish op forward * add instance_norm op forward prim * [CINN]Add bucket context (#60549) * [CINN] Add tile tactic * [CINN] Add bind cuda tactic * [CINN] Add bucket contexts * fix group output args bug * Add CUDNNv8 max pooling (#59413) * Add CUDNNv8 version of pool2d * Minor fix * Fix build failure * Remove dygraph API * Fix CI failure * Fix CI failure * Fix timeout * Fix timeout * Add comments * Minor fix * update lbfgs to avoid the randomness caused by paddle.dot() temporarily (#60591) * update lbfgs to avoid the randomness caused by paddle.dot() temporarily * add note * set_pir_tests_properties for some tests (#60401) * fix * Update CMakeLists.txt * Update pir_op_test_white_list * Update pir_op_test_white_list * Update pir_op_test_white_list * Add tests to whitelist (#60522) * fix * add * fix double grad without convert inplace (#60614) * fix fleetutil get_online_pass_interval bug3 (#60615) * fix fleetutil get_online_pass_interval bug3; test=develop * fix fleetutil get_online_pass_interval bug3; test=develop * fix fleetutil get_online_pass_interval bug3; test=develop * [PIR][DynamicShape] Add an example for broadcast in dynamic shape infer (#60608) * Add an example for broadcast in dynamic shape infer * fix_convert_all_blocks (#60613) * fix_convert_all_blocks * [Paddle-TRT] support set_value dynamic shape (#60508) [Paddle-TRT] support set_value dynamic shape (#60508) * fix (#60625) * [PIR] Support Region Clone in Operation::Clone (#60590) * deg2rad test passed (#60619) * [PIR+CINN]Fix Pool2d Variant Attibute for kernel_size (#60623) * [PIR+CINN]Fix Pool2d Variant Attibute for kernel_size * fix padding_size * fix pooling_type * [SOT] move_gpu_pinned_to_gpu (#60395) * PIR API adaptor No.35、40】 Migrate paddle.nn.ChannelShuffle/ClipGradByNorm into pir (#60445) * fix some bugs * fix bugs * Update clip.py * Update test_channel_shuffle.py * Update test_clip_by_norm_op.py * Update test_clip_by_norm_op.py * add param name for dist_tensor parameter (#60574) * Fix (#60631) * [PIR] Reify InferSymbolicShapeInterface (#60438) * Reify InferSymbolicShapeInterface * [Dynamic Shape] Remove ShapeBroadcastOp redundant codes (#60609) * [Dy2St] fix `test_grad` in PIR mode (#60621) --------- Co-authored-by: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> * reconstruct llama ci cases (#60637) * 【AutoParallel】Unify the fp16 and bf16 in auto-parallel (#60514) * unify the fp16 and bf16 * change white_list in AMP * add dtype support * fix bug in dtype * [Dynamic Shape] Add SplitGenerateShapeIntoShapeOpsPass (#60624) * [Dynamic 
Shape] Add SplitGenerateShapeIntoShapeOpsPass * Fix compile error * Fix compile error * update pdsa-2023-019, test=document_fix (#60646) * [SOT] sot export test files (#60547) * Improve the performence of put_along_axis (#60618) * fix bug of put_along_axis * improve performence of put_along_axis * [AutoParallel] Fit vpp for gradient_merge pass (#60560) * add dist attr * add op namescope * add test_semi_auto_parallel_hybrid_strategy (#60537) * [PIR]Open uts for AdaptiveAvgPool3D (#60636) * test (#60654) * [CINN] Add OptimizeReductionTactic (#60661) * [Paddle-Trt]update set_value cmakelist (#60664) [Paddle-Trt]update set_value cmakelist * [auto parallel] fix reshape infer shape (#60632) * [CINN+PIR]Clean Old GroupScheduler logic and switch into new_group_scheduler (#60642) * [CINN]Fix HasDynamicShape Bug while Type is NULL (#60658) * [PIR] pir onednn support legact istruction and lrn (#60502) * pir onednn support legact istruction and lrn * c_softmax_with_cross_entropy support bf16 for xpu (#60472) * enable custom device to use silu_fuse_pass (#60595) move SetUseCustomDevice to all platform * [XPU] add empty_like op and test, update XHPC to 20240105 (#60617) * [XPU] update XHPC date and refine FA ut (#60598) * [XPU] update XHPC date * update comments for ut * correct adamw bf16 unit test and the way to get data type (#60565) * Fix some PADDLE_THROW error type and change test cases (#60487) * fix error type * fix TypeError fix type fix fix fix fix * fix typo * as_complex as_real check_grad (#60666) * [Fix Bug] Fix Bugs of Two Pass (#60626) * [Fix Bug] Fix Bugs of Two Pass * Fix GenerateShapeOp bug * Modify unit test * Fix MakeGetterDimExpr4SymbolName * 【Hackathon 5th No.34】为 Paddle 新增 bitwise_right_shift / bitwise_right_shift_ / bitwise_left_shift / bitwise_left_shift_ API (#58092) * This PR enable offset of generator for custom device. 
(#60616) * [SOT] Convert dtype to `DataType` in PIR mode (#60627) * [PIR] Change output to block_arg from copy to a shared for the execution of while (#60607) * test * fix * fix * fix * 【auto parallel】custom op spmd infer add args check (#60633) * add bound check * add bound check * [PIR] Open PIR flag for test_ifelse (#60685) * open pir flag for test_ifelse * Update test_ifelse.py * Update test_ifelse.py * [CIN+PIR]Fix SplitOpPattern Bug in pd_to_cinn_pass (#60669) * [CIN+PIR]Fix SplitOpPattern Bug in pd_to_cinn_pass * fix index error * refine pir_all_path UT * fix bug * fix uncontiguous tensor resize bug (#60684) * fix uncontiguous tensor resize bug * [PIR]Support inplace custom op in pir (#60529) * support inplace in pir * fix inference ut * fix win bugs * fix win bug * fix * polish code * polish code * print log * print log * debug * fix win bugs * fix windows * fix (#60634) * [Docs] Update latest release version in README (#60691) * [CINN] Refine cmake for pass in cinn (#60683) * refine cmake for pass in cinn * add dependency in cmake * add dependency in cmake * [PIR]Open uts for PReLU (#60645) * [PIR]Open uts for ReLU6 (#60650) * [PIR]Open uts for RReLU (#60660) * [NPU] fix storage_properties type mismatch with OneDNN and NPU (#60566) * fix ttfnet_darknet53_1x_coco in pir mode (#60663) * [auto parallel] shard tensor stop gradient support (#60699) * [PIR][DynamicShape] Polish some codes (#60651) att, polish some codes * [PIR] fix onednn double reg (#60720) * fix onednn double reg * 【pir】modify add_n in while use blockarg instead of input value (#60668) * test * fix * fix * fix * modify add_n block_arg * modify increment return value * merge * modfiy whiel_op.py --------- Co-authored-by: zhangbo9674 * [PIR] Open test_case ut (#60721) * fix * fix * [PIR] rename data_layout (#60678) * rename data_layout * [xpu]: check op is null (#60656) * 【Hackathon 5th No.1】 为 Paddle 新增 copysign API (#57785) * add copysign op * fix codestyle * codestyle * fix test * fix std bug * merge init * merge init * merge init * add static cast * add std * static cast * static cast * copysignf * static cast to float input * float input * static cast to double input * fix * add inplace test * fix api * fix cast when grad * modify paddle.cast_ to cast_ * remove cast in python api * support fp16 && bf16 * set grad y to zero * fix en doc * support number input * add hostdevice * refactor kernel * fix nan when backward * add broadcast unit test * modify .cu * Update __init__.py * Update __init__.py * for ci test * static float * codestyle * static double * fix broadcast, try coverage * Delete paddle/phi/kernels/funcs/broadcast_function.h * remove unused * Update math.py * Update math.py * fix en doc * add test for output dtype, integer unsupported for now * update * update * fix * fix * add cast for input * fix * add pir test * fix doc * fix doc * fix doc * detail doc * adjust for MSVC * fix * Update python/paddle/tensor/math.py Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * Update python/paddle/tensor/math.py Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * fix doc output dtype, fix Equation * codestyle * codestyle * Update math.py --------- Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * rms_norm_infer_spmd (#60709) * [PIR]Open more tests for bernoulli and celu (#60706) * bernoulli && celu * celu test_error * [PIR]Open uts for scatter_nd_add (#60698) * [PIR]Open uts for scatter_nd_add * Fix ut * [PIR]Open uts for sinh 
(#60714) * [PIR]Open uts for Softshrink and Softsign (#60716) * [PIR] polish the ir_mapping implimentation. (#60675) * [PIR] fix onednn layout transform yaml format (#60680) * fix onednn layout transform yaml format * 【CINN】Complete error handler mechanism of dynamic schedule (#60718) * complete error handler mechanism of dynamic schedule * fix some output info * fix windows C++17 bug (#60736) * [XPU] fc pass and delete pass nodes check (#60314) * fix_local_windows_compile (#60682) * [PIR] fix onednn dialect name (#60665) * fix onednn dialect name * 【pir】add tesnor to array kernel etc (#60703) * merge * modfiy kernel * modify net * modify print * Fix defition definition (#60679) * cholesky and cholesky_solve tests (#60726) * [PIR]Open uts for searchsorted (#60700) * [PIR]Open uts for selu (#60702) * [PIR]Open uts for selu * Fix ut * [PIR]Open uts for sequence_mask (#60704) * [PIR] adjust pir pass log printing (#60723) * adjust pir pass log printing * update * update * update * fix compile * Fix Throughtput Throughput (#60741) * please last md (#60749) * [CINN+PIR]Fix Fetch XShape Variable logic (#60722) * [PIR][DynamicShape] Remove redundant code for shapeAnalysis and shapedTypeInterface (#60744) att, remove redundant code for shapeAnalysis and shapedTypeInterface * 【PIR Dist Op Reg No.1】 reg push_sparse_v2 (#60473) * code reg push_sparse_v2 * [Dynamic Shape] Provide operator<< For BroadcastTree (#60730) * [PIR] change IR clone to const and support clone operation successors (#60752) * support ir clone const and support clone operation successors * refine ir_mapping * refine region clone * [CINN] Refine fully_insert_broadcast_pass (#60676) * refine fully_insert_broadcast_pass * fix complie bug * fix complie * fix conflict * [PIR] einsum's inner_cache and xshape set to optional (#60748) * einsum's inner_cache and xshape set to intermediate * Update paddle/fluid/pir/dialect/operator/ir/ops.yaml --------- Co-authored-by: kangguangli * reduce runtime of unit-tests in windows-trt (#60731) * modify trt test to deal with Timeout * windows * [Paddle-TRT] upgrade EnqueueV2 to EnqueueV3 (#59950) * 【Hackathon 5th No.110】为 Paddle 增强 sparse.matmul API (#59890) * Fix rank_relatvie rank_relative (#60770) * add graph_key to specific graph's varmap (#60567) * add graph_key to specific graph's varmap * fix inpalce case * fix inpalce case * 【Hackathon 5th No.38】为 Paddle 新增 FractionalMaxPool2d / FractionalMaxPool3d API -kernel (#59847) * [Init] add fractional max pool kernel and api * [Fix] pooling.cu seed offset * [Change] remove adaptive from fractional max pool * [Change] fractional max 2d gpu pooling.cu grad * [Change] fractional max 2d gpu pooling.cu grad with dim3 * [Change] use UnchangedInferMeta * [Change] test api with uint16 * [Change] wrap test disable_static * [Change] regiester float16/bfloat16 * [Change] remove bfloat16 from cpu kernrl * [Change] test dtypes in cpu and gpu * [Change] test_fractional_max_pool3d_2d/3d timeout to 30s * [Fix] resolve conflict * [Change] win32 cannot detect bfloat16 correctly * [Change] force set_device * [Add] test random_u is None * [Change] use kernel_size for overlapping mode * [Change] clean headers * [CodeStyle] pooling * [Change] rename op * [Change] rename func without index * [Prim][PIR] Recover pir bn (#60689) * reopen bn prim pir * fix atol * decomp support batch_norm_ * fix test case * fix bug * fix code * [PIR]fc_with_special_op_fuse_pass bug fix (#60751) * bug fix update * update * delete all debug message * add code deleted wrong at last commit * 
delete createAutoMixedPrecisionPass in analysis_predictor.cc --------- Co-authored-by: HongyuJia Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> Co-authored-by: SigureMo Co-authored-by: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Co-authored-by: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Co-authored-by: JYChen Co-authored-by: Yuang Liu Co-authored-by: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Co-authored-by: YuanRisheng Co-authored-by: kevin Co-authored-by: wanghuancoder Co-authored-by: kangguangli Co-authored-by: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Co-authored-by: co63oc Co-authored-by: NeroLoh <745827440@qq.com> Co-authored-by: 傅剑寒 Co-authored-by: lzydev Co-authored-by: tianshuo78520a <707759223@qq.com> Co-authored-by: houj04 <35131887+houj04@users.noreply.github.com> Co-authored-by: Yuanle Liu Co-authored-by: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Co-authored-by: 张春乔 <83450930+Liyulingyue@users.noreply.github.com> Co-authored-by: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Co-authored-by: winter-wang <1030748926@qq.com> Co-authored-by: BiynXu <62832681+BiynXu@users.noreply.github.com> Co-authored-by: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Co-authored-by: Vigi Zhang Co-authored-by: zbt78 <1095497213@qq.com> Co-authored-by: liuzhenhai93 Co-authored-by: Aurelius84 Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Co-authored-by: Lu Qi <61354321+MarioLulab@users.noreply.github.com> Co-authored-by: LoneRanger <836253168@qq.com> Co-authored-by: freeliuzc Co-authored-by: YibLiu <68105073+YibinLiu666@users.noreply.github.com> Co-authored-by: engineer1109 Co-authored-by: danleifeng <52735331+danleifeng@users.noreply.github.com> Co-authored-by: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Co-authored-by: MayYouBeProsperous Co-authored-by: Huihuang Zheng Co-authored-by: gouzil <66515297+gouzil@users.noreply.github.com> Co-authored-by: 6clc Co-authored-by: Terry <38135104+TR666@users.noreply.github.com> Co-authored-by: winter-wang <78149749+winter-wang@users.noreply.github.com> Co-authored-by: Wang Xin Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com> Co-authored-by: Frank Lin Co-authored-by: pangengzheng <117730991+pangengzheng@users.noreply.github.com> Co-authored-by: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Co-authored-by: Tian Zheng Co-authored-by: lijialin03 <124568209+lijialin03@users.noreply.github.com> Co-authored-by: Wangzheee <634486483@qq.com> Co-authored-by: zhink <33270771+zhink@users.noreply.github.com> Co-authored-by: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Co-authored-by: Chen Zhiyang <1792266893@qq.com> Co-authored-by: feifei-111 <2364819892@qq.com> Co-authored-by: fsczz <57291768+fsczz@users.noreply.github.com> Co-authored-by: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Co-authored-by: Sonder <55493212+AndSonder@users.noreply.github.com> Co-authored-by: Liujie0926 <44688141+Liujie0926@users.noreply.github.com> Co-authored-by: WangZhen <23097963+0x45f@users.noreply.github.com> Co-authored-by: risemeup1 <62429225+risemeup1@users.noreply.github.com> Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Co-authored-by: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Co-authored-by: Jianbang Yang Co-authored-by: enzodechine Co-authored-by: Zhan Rongrui 
<46243324+zrr1999@users.noreply.github.com> Co-authored-by: coco <69197635+cocoshe@users.noreply.github.com> Co-authored-by: zhaohaixu <49297029+zhaohaixu@users.noreply.github.com> Co-authored-by: chen2016013 <111894720+chen2016013@users.noreply.github.com> Co-authored-by: zyfncg Co-authored-by: Qi Li Co-authored-by: zhangbo9674 Co-authored-by: Liuyinfeng <30849840+gitliuyf@users.noreply.github.com> Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> Co-authored-by: wendaxiao <113992173+wenxiaohahaha@users.noreply.github.com> Co-authored-by: cyberslack_lee Co-authored-by: lizexu123 <39205361+lizexu123@users.noreply.github.com> Co-authored-by: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Co-authored-by: megemini --- .../fluid/inference/api/analysis_predictor.cc | 10 ---- .../transforms/auto_mixed_precision_pass.cc | 50 ++----------------- 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index de6a2d0c97189..a8c73c3218398 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -105,7 +105,6 @@ #endif #include "paddle/fluid/ir_adaptor/translator/translate.h" -#include "paddle/fluid/pir/transforms/auto_mixed_precision_pass.h" #include "paddle/fluid/pir/transforms/constant_folding_pass.h" #include "paddle/fluid/pir/transforms/dead_code_elimination_pass.h" #include "paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.h" @@ -807,15 +806,6 @@ bool AnalysisPredictor::PrepareExecutor() { //----------------------------------------------------------------------------------------------// // Functional pass - // Do auto mixed precision pass first, so do not need to handle - // shadowoutput. 
- auto auto_mixed_precision_pass = ::pir::CreateAutoMixedPrecisionPass(); - auto_mixed_precision_pass->SetNotOwned(pir::kPlaceAttr, &place_); - phi::DataType data_type = - ConvertPrecision(config_.mixed_precision_mode_); - auto_mixed_precision_pass->SetNotOwned("__mixed_precision_mode__", - &data_type); - gpu_pm.AddPass(std::move(auto_mixed_precision_pass)); gpu_pm.AddPass(::pir::CreateIdentityOpCleanPass()); //----------------------------------------------------------------------------------------------// diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc index 6846ccc6b5753..0bf137bb09a23 100644 --- a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc @@ -82,20 +82,10 @@ class AutoMixedPrecisionPass : public pir::Pass { for (size_t i = 0; i < op->num_regions(); ++i) { auto& region = op->region(i); for (auto& block : region) { - VLOG(6) << "===========Get Op Precision============" << std::endl; GetOpPrecision(&block); - VLOG(6) << "===========Update Op Precision============" << std::endl; UpdateOpPrecision(&block); - - VLOG(6) << "===========" << op_run_low_precision_.size() << " of " - << block.size() << " ops" - << " run low precision" << std::endl; pir::Builder builder = pir::Builder(context_, &block); - VLOG(6) << "===========Process Op Precision============" << std::endl; - ProcessBlock(&block, builder); - VLOG(6) << "===========Insert Cast Op Num : " << insert_cast_op_num_ - << "============" << std::endl; } } } @@ -144,7 +134,6 @@ class AutoMixedPrecisionPass : public pir::Pass { void GetOpPrecision(pir::Block* block) { for (auto& op_item : *block) { auto op = &op_item; - VLOG(6) << "op name " << op->name(); auto op_name = op->name(); bool support_low_precision = true; if (black_list_.count(op_name)) { @@ -167,10 +156,6 @@ class AutoMixedPrecisionPass : public pir::Pass { } if (support_low_precision) { op_run_low_precision_.insert(op); - VLOG(6) << "op " << op->name() << " support low precision" << std::endl; - } else { - VLOG(6) << "op " << op->name() << " doesn't support low precision" - << std::endl; } } } @@ -235,8 +220,6 @@ class AutoMixedPrecisionPass : public pir::Pass { } if (!OpRunLowPrecision(op)) continue; if (CheckOutputIsScalarAttribute(op)) { // Output is ScalarAttribute - VLOG(6) << "op " << op->name() << " output is ScalarAttribute" - << std::endl; op_run_low_precision_.erase(op); precision_updated = true; } @@ -261,21 +244,10 @@ class AutoMixedPrecisionPass : public pir::Pass { } } } while (precision_updated); - for (auto& op_item : *block) { - auto op = &op_item; - if (op_should_not_handle_.count(op)) { - VLOG(6) << "op " << op->name() << " should not be handled" << std::endl; - } else if (op_run_low_precision_.count(op)) { - VLOG(6) << "op " << op->name() << " run low precision" << std::endl; - } else { - VLOG(6) << "op " << op->name() << " run high precision" << std::endl; - } - } } void RewriteOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite op " << op->name() << std::endl; if (IsBuiltinOp(op)) { RewriteBuiltinOp(op, builder); return; @@ -318,7 +290,6 @@ class AutoMixedPrecisionPass : public pir::Pass { phi::DataType precision, phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) const { auto& phi_op_type = op_type; - VLOG(6) << "phi_op_type = " << phi_op_type << std::endl; bool support = PhiKernelSupportPrecision(phi_op_type, backend, precision, layout); @@ -419,8 +390,8 @@ class 
AutoMixedPrecisionPass : public pir::Pass { auto new_vec_type = pir::VectorType::get(context, results_type); result.set_type(new_vec_type); } else { - VLOG(6) << "result type is not DenseTensorType or VectorType" - << std::endl; + PADDLE_THROW(phi::errors::Unimplemented( + "result type is not DenseTensorType or VectorType")); } } @@ -452,7 +423,6 @@ class AutoMixedPrecisionPass : public pir::Pass { IsVectorTypeFloat(result.type().dyn_cast())) { } } - VLOG(6) << "op " << op->name() << " doesn't have float result" << std::endl; return false; } @@ -517,10 +487,8 @@ class AutoMixedPrecisionPass : public pir::Pass { void RewriteBuiltinOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite builtin op " << op->name() << std::endl; // Rewrite CombineOp if (op->isa()) { - // auto vec_type = op->result(0).type().dyn_cast(); auto input_num = op->num_operands(); if (OpRunLowPrecision(op)) { for (size_t i = 0; i < input_num; ++i) { @@ -572,10 +540,8 @@ class AutoMixedPrecisionPass : public pir::Pass { void RewritePdOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite pd op " << op->name() << std::endl; - phi::Backend backend = ConvertPlaceToBackend(place_); std::string op_type = op->name().substr(op->name().find(".") + 1); - + phi::Backend backend = ConvertPlaceToBackend(place_); // Rewrite FetchOp if (op->isa()) { auto fetch_operand = op->operand(0); @@ -587,7 +553,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto result_dtype = paddle::dialect::TransToPhiDataType( pir::GetDataTypeFromValue(op->result(0))); if (fetch_operand_phi_dtype != result_dtype) { - VLOG(6) << "Insert CastOp for FetchOp" << std::endl; DoInsertCastOp(op, fetch_operand, result_dtype, builder); } return; @@ -607,9 +572,6 @@ class AutoMixedPrecisionPass : public pir::Pass { // Other pd ops if (OpRunLowPrecision(op)) { // change result's dtype to low precision - VLOG(6) << "Change result's dtype to low precision " << op->name() - << std::endl; - if (op->HasAttribute("dtype") && IsPhiDataTypeFloat( op->attribute("dtype") @@ -644,8 +606,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto result = op->result(i); if (!result.type()) continue; phi::DataType out_phi_dtype = output_defs[i].dtype; - VLOG(6) << "result dtype = " << phi::DataTypeToString(out_phi_dtype) - << std::endl; if (out_phi_dtype == phi::DataType::UNDEFINED) out_phi_dtype = precision_mode_; if (!IsPhiDataTypeFloat(out_phi_dtype)) @@ -663,8 +623,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); if (IsPhiDataTypeFloat(operand_phi_dtype) && operand_phi_dtype != in_phi_dtype) { - VLOG(6) << "Support low precision, insert CastOp for " << op->name() - << " operand " << i << std::endl; DoInsertCastOp(op, operand, in_phi_dtype, builder); } } @@ -677,8 +635,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); if (IsPhiDataTypeFloat(operand_phi_dtype) && operand_phi_dtype == precision_mode_) { - VLOG(6) << "Not support low precision, insert CastOp for " - << op->name() << " operand " << i << std::endl; DoInsertCastOp(op, operand, phi_dtype, builder); } }