From 2cac6f8a104653c47164402bc42c83d5607b5c18 Mon Sep 17 00:00:00 2001
From: Xinyu Yang
Date: Mon, 15 Jan 2024 14:59:10 +0800
Subject: [PATCH] Auto mixed precision no log (#4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [DimExpr] DimExpr support hash (#60471) * open warning with `paddle.utils.deprecated` (#60458) * open_warning * update unittest * update * fix typos * fix warning in test runner * uncomment * cleanup todo * using VisibleDeprecationWarning * update comment * fix typo * fix indentation * fix * fix * fix indent level and test * update --------- Co-authored-by: SigureMo * [AutoParallel] Auto Trans PP to VPP (#60467) * [AutoParallel] Auto Trans PP to VPP * add comment * 【PIR OpTest Fix No.23】 fix test_distribute_fpn_proposals_op (#60335) * fix * fix * fix test_lookup_table_v2_bf16_op (#60332) * Fix shape error in combined-indexing setitem (#60447) * add ut * fix shape error in combine-indexing * fix ut * [auto parallel] Add pp lazy init, bug fix for xavier (#60441) * [PIR] add slice_array_dense api (#60433) * fix * fix * Set value with scalar (#60452) * set_value with scalar * fix ut * [PIR]Support custom op in PIR (#59790) * support custom op in pir * fix compile bugs * fix bugs * delete code * fix windows bugs * fix windows bugs * add symbol to paddle lib * fix windows bugs * revert code * fix bugs * fix bugs * perfect code according comment * fix py3 * revert third party * fix bugs * fix bug * fix compile bugs * fix windows * [Prim][PIR] support roll, gather, scatter, scatter_nd_add op backward in pir prim (#60481) * prim gather op backward * prim scatter op backward * prim roll op backward * prim scatter_nd op backward * [PIR] delete dense_tensor mem_desc_ (#60024) * delete dense_tensor mem_desc_ * [PIR] Complement op defs (#60475) * complement translation of legacy matmul * Complement op mappings in translation for deformable_conv_v1.
* [pir]Supporting constant_folding_pass for train (#60355) * [pir]Supporting constant_folding_pass for train * fix * Update constant_folding_pass.cc * [Dynamic Shape] Fuse shape ops into generate shape op pass (#60490) * add shape.generate_shape op * rename shape.generate_shape to cinn_op.generate_shape * refactor GenerateShapeOp::SymbolBinding * move GenerateShapeOp related helper functions into generate_shape_util.cc * minor fix * minor fix * backup * refine signature of ConvertDimExprToAttribute * minor fix for signature of ConvertDimExprToAttributes * remove SubstituteDimExpr from generate_shape_util.h * Fix compile error * Fix unittest compile error * Code format * Code format * Fix _hiden_size to _hidden_size (#60485) * [DimExpr] Add substitute DimExpr util (#60493) * add SubstituteDimExpr * Fix compile error * Code format * Polish DimExprUtilTest * Change namesapce * Fix unittest * Polish DimExprUtilTest * [xpu]add sine_pos fuse pass and sine_pos xpu kernel (#60025) * add split with variable in factors and rewrite vectorize,unroll,bind error handling mechanism (#60449) * [CodeStyle] Fix regression of Ruff in sot (#60483) * support cast op from FP32 to low precision (#60385) * test=document_fix (#60399) * [XPU] refine flash attention ut (#60474) * [XPU] refine flash attention ut * refine tolerance * [Inference] support collect shape in sub block (#60451) * support collect shape in sub block * udpate * udpate * fix process mesh incorrect set in converter (#60504) * 【CMake opt No.13】Remove CINN DEPS in test/cpp/pir/shape_dialect/CMakeLists.txt (#60517) * Update CMakeLists.txt * Apply suggestions from code review * Apply suggestions from code review * Update CMakeLists.txt * Update CMakeLists.txt * 【pir】 add tensorarray op createarrylike, add_n (#60460) * optimize backward * [PIR] add vjp interface for while op * [PIR] fix ci error. * modify while stopgradient * merge * modify while grad bug * modify while grad op * modify * increment vp * [PIR] add get_used_external_value interface for block. 
* while case * delete print * delete print * Update python/paddle/autograd/ir_backward.py * [PIR] add unit_test for get_used_external_value * modify while_loop * code_style * modofy ci bug * modify while api * modify ci * modify array * Update python/paddle/autograd/ir_backward.py * Update test/legacy_test/test_cond.py * update * modify array_write grad info * merge * add_n and createarraylike * conflict * modify exe bug * modify kernel choose --------- Co-authored-by: winter-wang <1030748926@qq.com> * Add align iter space tactic (#60498) Add align iter space tactic * [Dynamic Shape] Add helper function MakeGenerateShapeOpAttribute (#60512) * add helper function MakeGenerateShapeOpAttribute * fix complier complaint * Code format * [Prim][PIR] Set prim gflag for pure cpp (#60505) * inference support decomp * polish code * add decomp base define * add decomp base define2 * change decomp infer * fix symbol overload * fix test case * debug * debug * decomp add debug info * add cpp flag * revert * remove unused flag * [PIR] Refine and fix pir exe (#60443) * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * update 2023 security advisory, test=document_fix (#60527) * [Inference] refine common/*.h for inference lib (#60513) * 【complex op】No.19 add complex support for triangular_solve (#59529) * fix reshard dist_attr (#60535) * 【auto parallel】剔除切分推导相关的头文件对proto 的依赖 (#60543) * decouple proto * format * format * strcuct pre def * [PIR] Support Operation::Clone Interface (#60536) * [PIR] Support Operation::Clone Interface * modify into shared_ptr * [Dynamic Shape] Add FullyInsertBroadcastPass and Broadcast Op (#60511) * add ShapeBroadcastOp * add pass FullyInsertBroadcastPass * InferSymbolicShape of BroadcastShape Op * Delete unit test * Fix return error * Code format * Fix error message * Update paddle/cinn/hlir/dialect/operator/transforms/fully_insert_broadcast_pass.cc Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> --------- Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> * Fix OpTranslatorTest name (#60518) * fix name * fix name * fix name * fix name * [PIR] migrate DataFeeder into pir (#60434) * 【PIR API adaptor No.90,92】Migrate some ops into pir (#59801) * [DimExpr] Convert Broadcast to BroadcastTree (#60440) * backup BroadcastTree * add SubstituteDimExpr * add helper function ConstructBroadcastTree * Fix compile error * Code format * Polish DimExprUtilTest * Add cmake file * Change namesapce * Fix compile error * Fix unittest * reconstruct BroadcastTree * Polish DimExprUtilTest * Reconstruct BroadcastTree * Finish BroadcastBranch * Finish BroadcastBranch * Finish BroadcastBranch * Add Unittest * Remove unnecessary dim_expr_util * Add header file * [Dynamic Shape] Erase expand (#60525) * EraseExpandOp * minor fix * minor fix * Code format * [inference] Support wint4 groupwise with cutlass gemm (#60422) * support gemv-groupwise func && weightQuanter-groupwise && weightDeQuanter-groupwise * fix build bug * add unit_test && fix bug * delete useless code * fix ci build bug * fix ci && optimize * fix merge conflict * add op change info * fix weight_only_linear_pass * fix format * solve ci unit_test * init * support cutlass gemm with groupwise * add unit test * fix strange bug * delete random bug * fix sm70 build bug * try to fix ci build bug * fix bug * fix volta build bug * skip sm70 in groupwise mode * change cutlass branch * simplify extent of loop after fuse and add corresponding test case (#60538) * fix bug of put_along_axis (#60551) * 
remove clearPass to allow custom device use fusion under fp16 (#60541) * fix fleetutil get_online_pass_interval bug2; test=develop (#60544) * fix vs2017 limit (#60528) * 【Hackathon 5th No.20】Add Exponential and Gamma APIs for Paddle (#57899) * add exponential * add gamma distribution * refine docs * add kl_divergence and test * resolve conflicts * resolve conflicts * fix bug * refine test * fix test timeout * refine code * add standard_gamma kernel * fix comments * fix tests * fix tests * fix comments * fix tests * fix gamma grad * fix yaml * fix bugs * fix tests * fix standard_gamma_grad * fix test * fix test * add cdf & icdf * add cdf & icdf * refine comments * fix * fix * fix head file * fix * fix cuda op * fix * fix * refine test * fix test * refine comments * fix comments * fix * fix * fix type check * fix docs * delete useless comments * [CINN] Add IntrinsicOps into ir_codes_collector (#60556) This PR fixed a bug when running ResNet from PaddleClas: vectorize introduces an intrinsic GetAddr, and because the tensor of GetAddr was not collected in ir_node_collector, the tensor alias was not created in the CUDA code. TODO: we may modify IntrinsicOp in the near future * 【auto parallel】custom op spmd rule register (#60509) * custom op spmd rule register * custom op spmd rule register * custom op spmd rule register * custom op spmd rule register * polish * 【AutoParallel】Add master grad in AMP-O2 of AutoParallel (#59987) * add master_grad in auto-parallel * reset third_party * fix coverage * support bf16 master_grad * fix bug in master_grad * change code according to review * change the way to find optimizer op * [Dy2St] Fix `NameloadJstTransformer` missing transform call kwargs (#60515) --------- Co-authored-by: gouzil <66515297+gouzil@users.noreply.github.com> * cinn(backends): generate infer shape kernel to infer shape of output tensor (#60519) The backend infer-shape results are returned through a two-dimensional pointer. The generated CINN IR looks like the following, where tensor_shape_args is a two-dimensional pointer: infer_shape_set_value(0, 0, S1, tensor_shape_args) means setting dim 0 of output tensor 0 to S1. * fix tensor math method inplace converter (#60546) * [xpu]Add vis_decoder_attention_xpu_pass && modify qkv_attention_xpu_kernel (#60361) * [Prim][PIR] support abs, instance_norm op backward in prim pir (#60444) * abs op backward * add test case * update code * update code * update code * update code * update code * instance_norm op backward * add instance_norm_v2 test cast * custom op * [PIR] remove log simply name mechnism from phi to common. (#60507) * [InferSymbolicShape] Delete redundent value_id_to_shapeordata_ (#60554) * 【Hackathon 5th No.25】add gammaln api (#60553) * fix (#60570) * [CINN] Add tile tactic and bind cuda tactic (#60534) * [CINN] Add tile tactic * [CINN] Add bind cuda tactic * 【PIR OpTest Fix No.8】 fix test_shuffle_batch_op (#59631) * fix test_shuffle_batch_op * fix * 【PIR OpTest Fix No.14】 fix test_nce (#60255) * fix test_nce * fix test_nce * Update ops.yaml * fix * Update utils.cc * Update ops.yaml * 【PIR OpTest Fix No.19】 fix test_ftrl_op (#60329) * fix test_ftrl_op * fix * [auto parallel] Lazy init for MP. Add reshard infer shape.
(#60563) * [PIR] Add unittest for Operation::Clone and Group::Clone (#60577) * [PIR] dce pass disable custom op (#60578) * [Inference] Fix bug of RunWithExternalStream API in new executor (#60122) * fix bug of RunWithExternalStream API in new executor * add test * fix bug of RunWithExternalStream API in new executor * reset flage in RunWithExternalStream * fix bug * add param swith_stream * fix bug * modify python api * fix bug * Resubmit PR-58859 (#60310) * allow multiple rng state in generator * Fix 60142; Fix some comments from sneaxiy * Overwrite copy constructors * add api * pre-commit * tensor_array slice in PIR (#60503) * use slice_array, now will meet error of destory opresult still in use * disable the pir test until the bug fixed * Set DistModel state_dict keys to structure_names (#60478) * exclude xpu * check structure name mapping * test pp * polish * support dynamic save static load * support dygraph save static load * polish * polish * use structured_name as key in DistModel state_dict * polish * polish * fix checkpoint path conflict * test get_rank_to_files * static save dynamic load test * fix sm75 build bug (#60583) * replace LOG(INFO) with VLOG(6) * Add CanProveDivisible for symbolic calculation (#60572) * add CanProveDivisible for symbolic calculation * delete extra cout for debug * fix according to some comments * [PIR][DynamicShape] make shape pass default and fix some bugs (#60548) att, make shape pass default and fix some bugs * Fix words (#60603) * 【auto parallel】custom op use spmd rule (#60571) * custom op use smpd rule * custom op use smpd rule * [auto parallel] add lazy init ut to llama (#60585) * 【pir】 modify array_write and array_read vjp , add a simple while with array_write (#60575) * optimize backward * [PIR] add vjp interface for while op * [PIR] fix ci error. * modify while stopgradient * merge * modify while grad bug * modify while grad op * modify * increment vp * [PIR] add get_used_external_value interface for block. 
* while case * delete print * delete print * Update python/paddle/autograd/ir_backward.py * [PIR] add unit_test for get_used_external_value * modify while_loop * code_style * modofy ci bug * modify while api * modify ci * modify array * Update python/paddle/autograd/ir_backward.py * Update test/legacy_test/test_cond.py * update * modify array_write grad info * merge * add_n and createarraylike * conflict * modify array_write vjp * modify array_write vjp * Update paddle/fluid/pybind/manual_static_op_function.h * modify array_write vjp * modify ci bug * modify * modify * Update test/legacy_test/test_while_loop_op.py * modify inplace array_read * Update test/legacy_test/test_while_op.py * Update test/ir/pir/test_while_api.py --------- Co-authored-by: winter-wang <1030748926@qq.com> * [Prim][PIR] add leaky_relu, sigmoid, instance_norm op forward prim (#60564) * hardswish op prim sink * hardswish op prim * add composite * add leaky_relu, sigmoid op forward prim * remove hardswish op forward * add instance_norm op forward prim * [CINN]Add bucket context (#60549) * [CINN] Add tile tactic * [CINN] Add bind cuda tactic * [CINN] Add bucket contexts * fix group output args bug * Add CUDNNv8 max pooling (#59413) * Add CUDNNv8 version of pool2d * Minor fix * Fix build failure * Remove dygraph API * Fix CI failure * Fix CI failure * Fix timeout * Fix timeout * Add comments * Minor fix * update lbfgs to avoid the randomness caused by paddle.dot() temporarily (#60591) * update lbfgs to avoid the randomness caused by paddle.dot() temporarily * add note * set_pir_tests_properties for some tests (#60401) * fix * Update CMakeLists.txt * Update pir_op_test_white_list * Update pir_op_test_white_list * Update pir_op_test_white_list * Add tests to whitelist (#60522) * fix * add * fix double grad without convert inplace (#60614) * fix fleetutil get_online_pass_interval bug3 (#60615) * fix fleetutil get_online_pass_interval bug3; test=develop * fix fleetutil get_online_pass_interval bug3; test=develop * fix fleetutil get_online_pass_interval bug3; test=develop * [PIR][DynamicShape] Add an example for broadcast in dynamic shape infer (#60608) * Add an example for broadcast in dynamic shape infer * fix_convert_all_blocks (#60613) * fix_convert_all_blocks * [Paddle-TRT] support set_value dynamic shape (#60508) [Paddle-TRT] support set_value dynamic shape (#60508) * fix (#60625) * [PIR] Support Region Clone in Operation::Clone (#60590) * deg2rad test passed (#60619) * [PIR+CINN]Fix Pool2d Variant Attibute for kernel_size (#60623) * [PIR+CINN]Fix Pool2d Variant Attibute for kernel_size * fix padding_size * fix pooling_type * [SOT] move_gpu_pinned_to_gpu (#60395) * PIR API adaptor No.35、40】 Migrate paddle.nn.ChannelShuffle/ClipGradByNorm into pir (#60445) * fix some bugs * fix bugs * Update clip.py * Update test_channel_shuffle.py * Update test_clip_by_norm_op.py * Update test_clip_by_norm_op.py * add param name for dist_tensor parameter (#60574) * Fix (#60631) * [PIR] Reify InferSymbolicShapeInterface (#60438) * Reify InferSymbolicShapeInterface * [Dynamic Shape] Remove ShapeBroadcastOp redundant codes (#60609) * [Dy2St] fix `test_grad` in PIR mode (#60621) --------- Co-authored-by: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> * reconstruct llama ci cases (#60637) * 【AutoParallel】Unify the fp16 and bf16 in auto-parallel (#60514) * unify the fp16 and bf16 * change white_list in AMP * add dtype support * fix bug in dtype * [Dynamic Shape] Add SplitGenerateShapeIntoShapeOpsPass (#60624) * [Dynamic 
Shape] Add SplitGenerateShapeIntoShapeOpsPass * Fix compile error * Fix compile error * update pdsa-2023-019, test=document_fix (#60646) * [SOT] sot export test files (#60547) * Improve the performence of put_along_axis (#60618) * fix bug of put_along_axis * improve performence of put_along_axis * [AutoParallel] Fit vpp for gradient_merge pass (#60560) * add dist attr * add op namescope * add test_semi_auto_parallel_hybrid_strategy (#60537) * [PIR]Open uts for AdaptiveAvgPool3D (#60636) * test (#60654) * [CINN] Add OptimizeReductionTactic (#60661) * [Paddle-Trt]update set_value cmakelist (#60664) [Paddle-Trt]update set_value cmakelist * [auto parallel] fix reshape infer shape (#60632) * [CINN+PIR]Clean Old GroupScheduler logic and switch into new_group_scheduler (#60642) * [CINN]Fix HasDynamicShape Bug while Type is NULL (#60658) * [PIR] pir onednn support legact istruction and lrn (#60502) * pir onednn support legact istruction and lrn * c_softmax_with_cross_entropy support bf16 for xpu (#60472) * enable custom device to use silu_fuse_pass (#60595) move SetUseCustomDevice to all platform * [XPU] add empty_like op and test, update XHPC to 20240105 (#60617) * [XPU] update XHPC date and refine FA ut (#60598) * [XPU] update XHPC date * update comments for ut * correct adamw bf16 unit test and the way to get data type (#60565) * Fix some PADDLE_THROW error type and change test cases (#60487) * fix error type * fix TypeError fix type fix fix fix fix * fix typo * as_complex as_real check_grad (#60666) * [Fix Bug] Fix Bugs of Two Pass (#60626) * [Fix Bug] Fix Bugs of Two Pass * Fix GenerateShapeOp bug * Modify unit test * Fix MakeGetterDimExpr4SymbolName * 【Hackathon 5th No.34】为 Paddle 新增 bitwise_right_shift / bitwise_right_shift_ / bitwise_left_shift / bitwise_left_shift_ API (#58092) * This PR enable offset of generator for custom device. 
(#60616) * [SOT] Convert dtype to `DataType` in PIR mode (#60627) * [PIR] Change output to block_arg from copy to a shared for the execution of while (#60607) * test * fix * fix * fix * 【auto parallel】custom op spmd infer add args check (#60633) * add bound check * add bound check * [PIR] Open PIR flag for test_ifelse (#60685) * open pir flag for test_ifelse * Update test_ifelse.py * Update test_ifelse.py * [CIN+PIR]Fix SplitOpPattern Bug in pd_to_cinn_pass (#60669) * [CIN+PIR]Fix SplitOpPattern Bug in pd_to_cinn_pass * fix index error * refine pir_all_path UT * fix bug * fix uncontiguous tensor resize bug (#60684) * fix uncontiguous tensor resize bug * [PIR]Support inplace custom op in pir (#60529) * support inplace in pir * fix inference ut * fix win bugs * fix win bug * fix * polish code * polish code * print log * print log * debug * fix win bugs * fix windows * fix (#60634) * [Docs] Update latest release version in README (#60691) * [CINN] Refine cmake for pass in cinn (#60683) * refine cmake for pass in cinn * add dependency in cmake * add dependency in cmake * [PIR]Open uts for PReLU (#60645) * [PIR]Open uts for ReLU6 (#60650) * [PIR]Open uts for RReLU (#60660) * [NPU] fix storage_properties type mismatch with OneDNN and NPU (#60566) * fix ttfnet_darknet53_1x_coco in pir mode (#60663) * [auto parallel] shard tensor stop gradient support (#60699) * [PIR][DynamicShape] Polish some codes (#60651) att, polish some codes * [PIR] fix onednn double reg (#60720) * fix onednn double reg * 【pir】modify add_n in while use blockarg instead of input value (#60668) * test * fix * fix * fix * modify add_n block_arg * modify increment return value * merge * modfiy whiel_op.py --------- Co-authored-by: zhangbo9674 * [PIR] Open test_case ut (#60721) * fix * fix * [PIR] rename data_layout (#60678) * rename data_layout * [xpu]: check op is null (#60656) * 【Hackathon 5th No.1】 为 Paddle 新增 copysign API (#57785) * add copysign op * fix codestyle * codestyle * fix test * fix std bug * merge init * merge init * merge init * add static cast * add std * static cast * static cast * copysignf * static cast to float input * float input * static cast to double input * fix * add inplace test * fix api * fix cast when grad * modify paddle.cast_ to cast_ * remove cast in python api * support fp16 && bf16 * set grad y to zero * fix en doc * support number input * add hostdevice * refactor kernel * fix nan when backward * add broadcast unit test * modify .cu * Update __init__.py * Update __init__.py * for ci test * static float * codestyle * static double * fix broadcast, try coverage * Delete paddle/phi/kernels/funcs/broadcast_function.h * remove unused * Update math.py * Update math.py * fix en doc * add test for output dtype, integer unsupported for now * update * update * fix * fix * add cast for input * fix * add pir test * fix doc * fix doc * fix doc * detail doc * adjust for MSVC * fix * Update python/paddle/tensor/math.py Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * Update python/paddle/tensor/math.py Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * fix doc output dtype, fix Equation * codestyle * codestyle * Update math.py --------- Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * rms_norm_infer_spmd (#60709) * [PIR]Open more tests for bernoulli and celu (#60706) * bernoulli && celu * celu test_error * [PIR]Open uts for scatter_nd_add (#60698) * [PIR]Open uts for scatter_nd_add * Fix ut * [PIR]Open uts for sinh 
(#60714) * [PIR]Open uts for Softshrink and Softsign (#60716) * [PIR] polish the ir_mapping implimentation. (#60675) * [PIR] fix onednn layout transform yaml format (#60680) * fix onednn layout transform yaml format * 【CINN】Complete error handler mechanism of dynamic schedule (#60718) * complete error handler mechanism of dynamic schedule * fix some output info * fix windows C++17 bug (#60736) * [XPU] fc pass and delete pass nodes check (#60314) * fix_local_windows_compile (#60682) * [PIR] fix onednn dialect name (#60665) * fix onednn dialect name * 【pir】add tesnor to array kernel etc (#60703) * merge * modfiy kernel * modify net * modify print * Fix defition definition (#60679) * cholesky and cholesky_solve tests (#60726) * [PIR]Open uts for searchsorted (#60700) * [PIR]Open uts for selu (#60702) * [PIR]Open uts for selu * Fix ut * [PIR]Open uts for sequence_mask (#60704) * [PIR] adjust pir pass log printing (#60723) * adjust pir pass log printing * update * update * update * fix compile * Fix Throughtput Throughput (#60741) * please last md (#60749) * [CINN+PIR]Fix Fetch XShape Variable logic (#60722) * [PIR][DynamicShape] Remove redundant code for shapeAnalysis and shapedTypeInterface (#60744) att, remove redundant code for shapeAnalysis and shapedTypeInterface * 【PIR Dist Op Reg No.1】 reg push_sparse_v2 (#60473) * code reg push_sparse_v2 * [Dynamic Shape] Provide operator<< For BroadcastTree (#60730) * [PIR] change IR clone to const and support clone operation successors (#60752) * support ir clone const and support clone operation successors * refine ir_mapping * refine region clone * [CINN] Refine fully_insert_broadcast_pass (#60676) * refine fully_insert_broadcast_pass * fix complie bug * fix complie * fix conflict * [PIR] einsum's inner_cache and xshape set to optional (#60748) * einsum's inner_cache and xshape set to intermediate * Update paddle/fluid/pir/dialect/operator/ir/ops.yaml --------- Co-authored-by: kangguangli * reduce runtime of unit-tests in windows-trt (#60731) * modify trt test to deal with Timeout * windows * [Paddle-TRT] upgrade EnqueueV2 to EnqueueV3 (#59950) * 【Hackathon 5th No.110】为 Paddle 增强 sparse.matmul API (#59890) * Fix rank_relatvie rank_relative (#60770) * add graph_key to specific graph's varmap (#60567) * add graph_key to specific graph's varmap * fix inpalce case * fix inpalce case * 【Hackathon 5th No.38】为 Paddle 新增 FractionalMaxPool2d / FractionalMaxPool3d API -kernel (#59847) * [Init] add fractional max pool kernel and api * [Fix] pooling.cu seed offset * [Change] remove adaptive from fractional max pool * [Change] fractional max 2d gpu pooling.cu grad * [Change] fractional max 2d gpu pooling.cu grad with dim3 * [Change] use UnchangedInferMeta * [Change] test api with uint16 * [Change] wrap test disable_static * [Change] regiester float16/bfloat16 * [Change] remove bfloat16 from cpu kernrl * [Change] test dtypes in cpu and gpu * [Change] test_fractional_max_pool3d_2d/3d timeout to 30s * [Fix] resolve conflict * [Change] win32 cannot detect bfloat16 correctly * [Change] force set_device * [Add] test random_u is None * [Change] use kernel_size for overlapping mode * [Change] clean headers * [CodeStyle] pooling * [Change] rename op * [Change] rename func without index * [Prim][PIR] Recover pir bn (#60689) * reopen bn prim pir * fix atol * decomp support batch_norm_ * fix test case * fix bug * fix code * [PIR]fc_with_special_op_fuse_pass bug fix (#60751) * bug fix update * update * delete all debug message * add code deleted wrong at last commit * 
delete createAutoMixedPrecisionPass in analysis_predictor.cc --------- Co-authored-by: HongyuJia Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> Co-authored-by: SigureMo Co-authored-by: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Co-authored-by: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Co-authored-by: JYChen Co-authored-by: Yuang Liu Co-authored-by: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Co-authored-by: YuanRisheng Co-authored-by: kevin Co-authored-by: wanghuancoder Co-authored-by: kangguangli Co-authored-by: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Co-authored-by: co63oc Co-authored-by: NeroLoh <745827440@qq.com> Co-authored-by: 傅剑寒 Co-authored-by: lzydev Co-authored-by: tianshuo78520a <707759223@qq.com> Co-authored-by: houj04 <35131887+houj04@users.noreply.github.com> Co-authored-by: Yuanle Liu Co-authored-by: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Co-authored-by: 张春乔 <83450930+Liyulingyue@users.noreply.github.com> Co-authored-by: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Co-authored-by: winter-wang <1030748926@qq.com> Co-authored-by: BiynXu <62832681+BiynXu@users.noreply.github.com> Co-authored-by: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Co-authored-by: Vigi Zhang Co-authored-by: zbt78 <1095497213@qq.com> Co-authored-by: liuzhenhai93 Co-authored-by: Aurelius84 Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Co-authored-by: Lu Qi <61354321+MarioLulab@users.noreply.github.com> Co-authored-by: LoneRanger <836253168@qq.com> Co-authored-by: freeliuzc Co-authored-by: YibLiu <68105073+YibinLiu666@users.noreply.github.com> Co-authored-by: engineer1109 Co-authored-by: danleifeng <52735331+danleifeng@users.noreply.github.com> Co-authored-by: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Co-authored-by: MayYouBeProsperous Co-authored-by: Huihuang Zheng Co-authored-by: gouzil <66515297+gouzil@users.noreply.github.com> Co-authored-by: 6clc Co-authored-by: Terry <38135104+TR666@users.noreply.github.com> Co-authored-by: winter-wang <78149749+winter-wang@users.noreply.github.com> Co-authored-by: Wang Xin Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com> Co-authored-by: Frank Lin Co-authored-by: pangengzheng <117730991+pangengzheng@users.noreply.github.com> Co-authored-by: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Co-authored-by: Tian Zheng Co-authored-by: lijialin03 <124568209+lijialin03@users.noreply.github.com> Co-authored-by: Wangzheee <634486483@qq.com> Co-authored-by: zhink <33270771+zhink@users.noreply.github.com> Co-authored-by: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Co-authored-by: Chen Zhiyang <1792266893@qq.com> Co-authored-by: feifei-111 <2364819892@qq.com> Co-authored-by: fsczz <57291768+fsczz@users.noreply.github.com> Co-authored-by: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Co-authored-by: Sonder <55493212+AndSonder@users.noreply.github.com> Co-authored-by: Liujie0926 <44688141+Liujie0926@users.noreply.github.com> Co-authored-by: WangZhen <23097963+0x45f@users.noreply.github.com> Co-authored-by: risemeup1 <62429225+risemeup1@users.noreply.github.com> Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Co-authored-by: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Co-authored-by: Jianbang Yang Co-authored-by: enzodechine Co-authored-by: Zhan Rongrui 
<46243324+zrr1999@users.noreply.github.com> Co-authored-by: coco <69197635+cocoshe@users.noreply.github.com> Co-authored-by: zhaohaixu <49297029+zhaohaixu@users.noreply.github.com> Co-authored-by: chen2016013 <111894720+chen2016013@users.noreply.github.com> Co-authored-by: zyfncg Co-authored-by: Qi Li Co-authored-by: zhangbo9674 Co-authored-by: Liuyinfeng <30849840+gitliuyf@users.noreply.github.com> Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> Co-authored-by: wendaxiao <113992173+wenxiaohahaha@users.noreply.github.com> Co-authored-by: cyberslack_lee Co-authored-by: lizexu123 <39205361+lizexu123@users.noreply.github.com> Co-authored-by: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Co-authored-by: megemini --- .../fluid/inference/api/analysis_predictor.cc | 10 ---- .../transforms/auto_mixed_precision_pass.cc | 50 ++----------------- 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index de6a2d0c97189..a8c73c3218398 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -105,7 +105,6 @@ #endif #include "paddle/fluid/ir_adaptor/translator/translate.h" -#include "paddle/fluid/pir/transforms/auto_mixed_precision_pass.h" #include "paddle/fluid/pir/transforms/constant_folding_pass.h" #include "paddle/fluid/pir/transforms/dead_code_elimination_pass.h" #include "paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.h" @@ -807,15 +806,6 @@ bool AnalysisPredictor::PrepareExecutor() { //----------------------------------------------------------------------------------------------// // Functional pass - // Do auto mixed precision pass first, so do not need to handle - // shadowoutput. 
- auto auto_mixed_precision_pass = ::pir::CreateAutoMixedPrecisionPass(); - auto_mixed_precision_pass->SetNotOwned(pir::kPlaceAttr, &place_); - phi::DataType data_type = - ConvertPrecision(config_.mixed_precision_mode_); - auto_mixed_precision_pass->SetNotOwned("__mixed_precision_mode__", - &data_type); - gpu_pm.AddPass(std::move(auto_mixed_precision_pass)); gpu_pm.AddPass(::pir::CreateIdentityOpCleanPass()); //----------------------------------------------------------------------------------------------// diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc index 6846ccc6b5753..0bf137bb09a23 100644 --- a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc @@ -82,20 +82,10 @@ class AutoMixedPrecisionPass : public pir::Pass { for (size_t i = 0; i < op->num_regions(); ++i) { auto& region = op->region(i); for (auto& block : region) { - VLOG(6) << "===========Get Op Precision============" << std::endl; GetOpPrecision(&block); - VLOG(6) << "===========Update Op Precision============" << std::endl; UpdateOpPrecision(&block); - - VLOG(6) << "===========" << op_run_low_precision_.size() << " of " - << block.size() << " ops" - << " run low precision" << std::endl; pir::Builder builder = pir::Builder(context_, &block); - VLOG(6) << "===========Process Op Precision============" << std::endl; - ProcessBlock(&block, builder); - VLOG(6) << "===========Insert Cast Op Num : " << insert_cast_op_num_ - << "============" << std::endl; } } } @@ -144,7 +134,6 @@ class AutoMixedPrecisionPass : public pir::Pass { void GetOpPrecision(pir::Block* block) { for (auto& op_item : *block) { auto op = &op_item; - VLOG(6) << "op name " << op->name(); auto op_name = op->name(); bool support_low_precision = true; if (black_list_.count(op_name)) { @@ -167,10 +156,6 @@ class AutoMixedPrecisionPass : public pir::Pass { } if (support_low_precision) { op_run_low_precision_.insert(op); - VLOG(6) << "op " << op->name() << " support low precision" << std::endl; - } else { - VLOG(6) << "op " << op->name() << " doesn't support low precision" - << std::endl; } } } @@ -235,8 +220,6 @@ class AutoMixedPrecisionPass : public pir::Pass { } if (!OpRunLowPrecision(op)) continue; if (CheckOutputIsScalarAttribute(op)) { // Output is ScalarAttribute - VLOG(6) << "op " << op->name() << " output is ScalarAttribute" - << std::endl; op_run_low_precision_.erase(op); precision_updated = true; } @@ -261,21 +244,10 @@ class AutoMixedPrecisionPass : public pir::Pass { } } } while (precision_updated); - for (auto& op_item : *block) { - auto op = &op_item; - if (op_should_not_handle_.count(op)) { - VLOG(6) << "op " << op->name() << " should not be handled" << std::endl; - } else if (op_run_low_precision_.count(op)) { - VLOG(6) << "op " << op->name() << " run low precision" << std::endl; - } else { - VLOG(6) << "op " << op->name() << " run high precision" << std::endl; - } - } } void RewriteOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite op " << op->name() << std::endl; if (IsBuiltinOp(op)) { RewriteBuiltinOp(op, builder); return; @@ -318,7 +290,6 @@ class AutoMixedPrecisionPass : public pir::Pass { phi::DataType precision, phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) const { auto& phi_op_type = op_type; - VLOG(6) << "phi_op_type = " << phi_op_type << std::endl; bool support = PhiKernelSupportPrecision(phi_op_type, backend, precision, layout); @@ -419,8 +390,8 @@ class 
AutoMixedPrecisionPass : public pir::Pass { auto new_vec_type = pir::VectorType::get(context, results_type); result.set_type(new_vec_type); } else { - VLOG(6) << "result type is not DenseTensorType or VectorType" - << std::endl; + PADDLE_THROW(phi::errors::Unimplemented( + "result type is not DenseTensorType or VectorType")); } } @@ -452,7 +423,6 @@ class AutoMixedPrecisionPass : public pir::Pass { IsVectorTypeFloat(result.type().dyn_cast())) { } } - VLOG(6) << "op " << op->name() << " doesn't have float result" << std::endl; return false; } @@ -517,10 +487,8 @@ class AutoMixedPrecisionPass : public pir::Pass { void RewriteBuiltinOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite builtin op " << op->name() << std::endl; // Rewrite CombineOp if (op->isa()) { - // auto vec_type = op->result(0).type().dyn_cast(); auto input_num = op->num_operands(); if (OpRunLowPrecision(op)) { for (size_t i = 0; i < input_num; ++i) { @@ -572,10 +540,8 @@ class AutoMixedPrecisionPass : public pir::Pass { void RewritePdOp(pir::Operation* op, pir::Builder& builder) { // NOLINT - VLOG(6) << "Rewrite pd op " << op->name() << std::endl; - phi::Backend backend = ConvertPlaceToBackend(place_); std::string op_type = op->name().substr(op->name().find(".") + 1); - + phi::Backend backend = ConvertPlaceToBackend(place_); // Rewrite FetchOp if (op->isa()) { auto fetch_operand = op->operand(0); @@ -587,7 +553,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto result_dtype = paddle::dialect::TransToPhiDataType( pir::GetDataTypeFromValue(op->result(0))); if (fetch_operand_phi_dtype != result_dtype) { - VLOG(6) << "Insert CastOp for FetchOp" << std::endl; DoInsertCastOp(op, fetch_operand, result_dtype, builder); } return; @@ -607,9 +572,6 @@ class AutoMixedPrecisionPass : public pir::Pass { // Other pd ops if (OpRunLowPrecision(op)) { // change result's dtype to low precision - VLOG(6) << "Change result's dtype to low precision " << op->name() - << std::endl; - if (op->HasAttribute("dtype") && IsPhiDataTypeFloat( op->attribute("dtype") @@ -644,8 +606,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto result = op->result(i); if (!result.type()) continue; phi::DataType out_phi_dtype = output_defs[i].dtype; - VLOG(6) << "result dtype = " << phi::DataTypeToString(out_phi_dtype) - << std::endl; if (out_phi_dtype == phi::DataType::UNDEFINED) out_phi_dtype = precision_mode_; if (!IsPhiDataTypeFloat(out_phi_dtype)) @@ -663,8 +623,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); if (IsPhiDataTypeFloat(operand_phi_dtype) && operand_phi_dtype != in_phi_dtype) { - VLOG(6) << "Support low precision, insert CastOp for " << op->name() - << " operand " << i << std::endl; DoInsertCastOp(op, operand, in_phi_dtype, builder); } } @@ -677,8 +635,6 @@ class AutoMixedPrecisionPass : public pir::Pass { auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand); if (IsPhiDataTypeFloat(operand_phi_dtype) && operand_phi_dtype == precision_mode_) { - VLOG(6) << "Not support low precision, insert CastOp for " - << op->name() << " operand " << i << std::endl; DoInsertCastOp(op, operand, phi_dtype, builder); } }