diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 5009dddce2f14..2df86e86a75e0 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -79,7 +79,8 @@ void AppendSkipDeletionVars(const std::vector &append_vars,
  * 2. it is an input var used in backward_op
  */
 void ParseSafeEagerDeletionSkipVars(
-    const ProgramDesc &program, int64_t forward_op_nums,
+    const ProgramDesc &program,
+    int64_t forward_op_nums,
     const std::vector &output_var_names,
     std::vector *skip_eager_delete_vars) {
   auto all_ops = program.Block(0).AllOps();
@@ -143,8 +144,11 @@ ExecutorInfoCache &ExecutorInfoCache::Instance() {
 }
 
 static PEAndGraphPair CreateExecutorInfo(
-    const ProgramDesc &program_desc, const platform::Place &place,
-    int64_t start_op_index, int64_t end_op_index, framework::Scope *scope,
+    const ProgramDesc &program_desc,
+    const platform::Place &place,
+    int64_t start_op_index,
+    int64_t end_op_index,
+    framework::Scope *scope,
     const details::BuildStrategy &build_strategy) {
   auto execution_strategy = details::GetExecutionStrategy(place);
   auto graph = std::make_shared(
@@ -162,15 +166,17 @@ PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc,
                                           framework::Scope *scope) {
   details::BuildStrategy build_strategy;
   build_strategy.fix_op_run_order_ = true;
-  auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
-                                         end_op_index, scope, build_strategy);
+  auto pe_and_graph = CreateExecutorInfo(
+      program_desc, place, start_op_index, end_op_index, scope, build_strategy);
   return pe_and_graph;
 }
 
 CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
                                    const platform::Place &place,
-                                   int64_t start_op_index, int64_t end_op_index,
-                                   bool is_grad, int64_t program_id,
+                                   int64_t start_op_index,
+                                   int64_t end_op_index,
+                                   bool is_grad,
+                                   int64_t program_id,
                                    framework::Scope *scope) {
   auto &cached_exe_info = framework::ExecutorInfoCache::Instance();
 
@@ -186,8 +192,12 @@ CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
   auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id);
 
   // 2. Construct Graph and ParallelExecutor.
-  auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
-                                         end_op_index, scope, build_strategy);
+  auto pe_and_graph = CreateExecutorInfo(program_desc,
+                                         place,
+                                         start_op_index,
+                                         end_op_index,
+                                         scope,
+                                         build_strategy);
 
   // 3. Insert value into cached map.
   auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad);
diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
index d62acb759868c..f28696194e5f6 100644
--- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
@@ -115,21 +115,25 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       auto type = std::string{"sgd"};
       // auto LearningRate = op->Input("LearningRate");
       auto use_nesterov = BOOST_GET_CONST(bool, op->GetAttr("use_nesterov"));
-      PADDLE_ENFORCE_EQ(use_nesterov, false,
+      PADDLE_ENFORCE_EQ(use_nesterov,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support nesterov mode."));
       auto regularization_method =
           BOOST_GET_CONST(std::string, op->GetAttr("regularization_method"));
-      PADDLE_ENFORCE_NE(regularization_method, "l1_decay",
+      PADDLE_ENFORCE_NE(regularization_method,
+                        "l1_decay",
                         platform::errors::Unimplemented(
                             "ipu does not support l1_decay mode."));
       auto multi_precision =
           BOOST_GET_CONST(bool, op->GetAttr("multi_precision"));
-      PADDLE_ENFORCE_EQ(multi_precision, false,
+      PADDLE_ENFORCE_EQ(multi_precision,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support multi_precision mode."));
       auto rescale_grad = BOOST_GET_CONST(float, op->GetAttr("rescale_grad"));
-      PADDLE_ENFORCE_EQ(rescale_grad, 1.0,
+      PADDLE_ENFORCE_EQ(rescale_grad,
+                        1.0,
                         platform::errors::Unimplemented(
                             "ipu does not support rescale_grad mode."));
       auto regularization_coeff =
@@ -150,10 +154,12 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       auto lazy_mode = BOOST_GET_CONST(bool, op->GetAttr("lazy_mode"));
       auto multi_precision =
           BOOST_GET_CONST(bool, op->GetAttr("multi_precision"));
-      PADDLE_ENFORCE_EQ(lazy_mode, false,
+      PADDLE_ENFORCE_EQ(lazy_mode,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support lazy_mode mode."));
-      PADDLE_ENFORCE_EQ(multi_precision, false,
+      PADDLE_ENFORCE_EQ(multi_precision,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support multi_precision mode."));
       new_op.SetAttr("type", type);
@@ -268,11 +274,13 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       VLOG(10) << "found loss op type: " << op->Type();
       auto outputs = op->Outputs();
       PADDLE_ENFORCE_EQ(
-          outputs.size(), 1,
+          outputs.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss key"));
       auto losses = outputs.begin()->second;
       PADDLE_ENFORCE_EQ(
-          losses.size(), 1,
+          losses.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss name"));
       auto loss_var = losses.front();
       new_op.SetAttr("loss_var", loss_var);
@@ -282,11 +290,13 @@
     } else if (op_type == "identity_loss") {
       auto outputs = op->Outputs();
       PADDLE_ENFORCE_EQ(
-          outputs.size(), 1,
+          outputs.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss key"));
       auto losses = outputs.begin()->second;
       PADDLE_ENFORCE_EQ(
-          losses.size(), 1,
+          losses.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss name"));
       auto loss_var = losses.front();
       new_op.SetAttr("loss_var", loss_var);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f5340f08677e2..189328e95abac 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -55,11 +55,13 @@ DECLARE_bool(sync_nccl_allreduce);
 #include "gperftools/profiler.h"
 #endif
PADDLE_DEFINE_EXPORTED_string( - pe_profile_fname, "", + pe_profile_fname, + "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); PADDLE_DEFINE_EXPORTED_bool( - enable_parallel_graph, false, + enable_parallel_graph, + false, "Force disable parallel graph execution mode if set false."); namespace paddle { @@ -166,8 +168,8 @@ class ParallelExecutorPrivate { std::vector flat_nccl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create ncclid when nranks==1 - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -184,7 +186,8 @@ class ParallelExecutorPrivate { } else { nccl_id = new ncclUniqueId(); PADDLE_ENFORCE_EQ( - platform::dynload::ncclGetUniqueId(nccl_id), ncclSuccess, + platform::dynload::ncclGetUniqueId(nccl_id), + ncclSuccess, platform::errors::PreconditionNotMet( "PaddlePaddle failed to get NCCL unique ID. It may due to your " "system settings or NCCL library error, please debug on NCCL")); @@ -194,16 +197,16 @@ class ParallelExecutorPrivate { flat_nccl_ids.push_back(nccl_id); - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); VLOG(1) << "init bst nccl context complete!"; return; } // num_trainers ==1 && places > 1 if (bst.num_trainers_ == 1) { - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -217,8 +220,8 @@ class ParallelExecutorPrivate { flat_nccl_ids.push_back(nccl_id); } - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); if (bst.use_hierarchical_allreduce_) { std::vector inter_nccl_ids; @@ -244,8 +247,12 @@ class ParallelExecutorPrivate { } nccl_ctxs_->InitHierarchicalCtxs( - places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_, - bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_, + places_, + inter_nccl_ids, + exter_nccl_ids, + bst.num_trainers_, + bst.trainer_id_, + bst.hierarchical_allreduce_inter_nranks_, bst.hierarchical_allreduce_exter_nranks_); } } @@ -254,7 +261,8 @@ class ParallelExecutorPrivate { const std::string var_name = "NCCLCommunicator"; auto var = scope->FindVar(var_name); if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), true, + PADDLE_ENFORCE_EQ(var->IsInitialized(), + true, platform::errors::PreconditionNotMet( "if %s exists, it must be initialized", var_name)); VLOG(1) << "find " << var_name @@ -265,19 +273,23 @@ class ParallelExecutorPrivate { if (bst->use_hierarchical_allreduce_) { PADDLE_ENFORCE_GT( - bst->num_trainers_, 1, + bst->num_trainers_, + 1, platform::errors::PreconditionNotMet( "The num_trainers should be greater than 1, but received %llu.", bst->num_trainers_)); PADDLE_ENFORCE_GT( - bst->hierarchical_allreduce_inter_nranks_, 1, + bst->hierarchical_allreduce_inter_nranks_, + 1, platform::errors::PreconditionNotMet( "The inter_nranks should be greater than 1, but received %d.", bst->hierarchical_allreduce_inter_nranks_)); PADDLE_ENFORCE_EQ( - bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, 0, + bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, + 0, 
platform::errors::PreconditionNotMet( - "num_trainers:%llu mod inter_nranks:%d != 0", bst->num_trainers_, + "num_trainers:%llu mod inter_nranks:%d != 0", + bst->num_trainers_, bst->hierarchical_allreduce_inter_nranks_)); bst->hierarchical_allreduce_exter_nranks_ = @@ -296,15 +308,16 @@ class ParallelExecutorPrivate { << ", num_trainers:" << bst.num_trainers_ << ", trainer_id:" << bst.trainer_id_; - PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, false, + PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, + false, platform::errors::Unimplemented( "xpu doesn't support use_hierarchical_allreduce")); std::vector flat_bkcl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create bkclid when nranks==1 - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -320,23 +333,24 @@ class ParallelExecutorPrivate { bkcl_id = bkcl_id_var->GetMutable(); } else { PADDLE_ENFORCE_EQ( - bkcl_get_unique_id(id.get()), BKCL_SUCCESS, + bkcl_get_unique_id(id.get()), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl get unique id failed")); bkcl_id = id.get(); } flat_bkcl_ids.push_back(bkcl_id); - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); VLOG(1) << "init bst bkcl context complete!"; return; } // num_trainers ==1 && places > 1 if (bst.num_trainers_ == 1) { - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -350,8 +364,8 @@ class ParallelExecutorPrivate { flat_bkcl_ids.push_back(bkcl_id); } - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); } void InitOrGetBKCLCommunicator(framework::Scope *scope, @@ -359,7 +373,8 @@ class ParallelExecutorPrivate { const std::string var_name = "BKCLCommunicator"; auto var = scope->FindVar(var_name); if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), true, + PADDLE_ENFORCE_EQ(var->IsInitialized(), + true, platform::errors::PreconditionNotMet( "if %s exists, it must be initialized", var_name)); VLOG(1) << "find " << var_name @@ -679,11 +694,11 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, platform::errors::Unavailable( "NPU is not supported in ParallelExecutor.")); InitP2P(places); - ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), - member_->places_.size()); + ir::InitReaderQueueDeviceCount( + graph, *(member_->global_scope_), member_->places_.size()); // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), - *graph); + InitExecutorPrivateMemberInfo( + exec_strategy, build_strategy, places.size(), *graph); // Step 1. 
Create local scopes and Clone graph into multi device CreateLocalScopes(scope, local_scopes, /*create_new*/ true); @@ -728,22 +743,29 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; if (!member_->build_strategy_.async_mode_) { member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + std::move(var_infos), + member_->places_, + std::move(member_->executor_))); } ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); SetReaderOpDeviceInfoOfGraphs(final_graphs); } -ParallelExecutor::ParallelExecutor(const platform::Place &place, Scope *scope, +ParallelExecutor::ParallelExecutor(const platform::Place &place, + Scope *scope, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate({place}, scope)) { // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, - /*device_count=*/1, *graph); + InitExecutorPrivateMemberInfo(exec_strategy, + build_strategy, + /*device_count=*/1, + *graph); CreateLocalScopes(scope, /*local_scope=*/{scope}, /*create_new=*/false); @@ -819,18 +841,24 @@ void ParallelExecutor::BCastParamsToDevices( buffers.push_back(buffer); } - PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + buffers.size(), platform::errors::PreconditionNotMet( "variables' buffer size to bcast is %d, which is " "NOT equal to places size %d", - buffers.size(), member_->places_.size())); + buffers.size(), + member_->places_.size())); if (member_->nccl_ctxs_ != nullptr) { auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + platform::dynload::ncclBcast(buffers[i], + numel, + data_type, + 0, + nccl_ctx.comm_, + nccl_ctx.stream()); } nccl_ctxs->WaitAll(); } else { @@ -844,8 +872,12 @@ void ParallelExecutor::BCastParamsToDevices( platform::DeviceContextPool::Instance().Get(dst_place)); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); - memory::Copy(dst_place, buffers[i], src_place, buffers[0], - sizeof_dtype, src_dev_ctx->stream()); + memory::Copy(dst_place, + buffers[i], + src_place, + buffers[0], + sizeof_dtype, + src_dev_ctx->stream()); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); } @@ -879,16 +911,19 @@ void ParallelExecutor::BCastParamsToDevices( buffers.push_back(buffer); } - PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + buffers.size(), platform::errors::PreconditionNotMet( "variables' buffer size to bcast is %d, which is " "NOT equal to places size %d", - buffers.size(), member_->places_.size())); + buffers.size(), + member_->places_.size())); { auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx(); PADDLE_ENFORCE_EQ( - bkcl_group_start(), BKCL_SUCCESS, + bkcl_group_start(), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl_group_start failed")); for (size_t i = 0; i < member_->places_.size(); ++i) { auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]); @@ -898,13 +933,19 @@ void ParallelExecutor::BCastParamsToDevices( 
broadcast_numel *= 2; } PADDLE_ENFORCE_EQ( - bkcl_broadcast(bkcl_ctx.comm(), buffers[i], buffers[i], - broadcast_numel, data_type, 0, NULL), + bkcl_broadcast(bkcl_ctx.comm(), + buffers[i], + buffers[i], + broadcast_numel, + data_type, + 0, + NULL), BKCL_SUCCESS, platform::errors::Unavailable("bkcl_broadcast failed")); } PADDLE_ENFORCE_EQ( - bkcl_group_end(), BKCL_SUCCESS, + bkcl_group_end(), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl_group_end failed")); } #else @@ -942,21 +983,24 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { - platform::RecordEvent record_run("ParallelExecutor::Run", - platform::TracerEventType::UserDefined, 1); + platform::RecordEvent record_run( + "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; #ifdef PADDLE_WITH_CUDA if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), + true, platform::errors::InvalidArgument( "Cannot fetch data when using CUDA Graph.")); PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, true, + member_->build_strategy_.allow_cuda_graph_capture_, + true, platform::errors::InvalidArgument( "You must turn on build_strategy.allow_cuda_graph_capture = True " "to enable CUDA Graph capturing.")); PADDLE_ENFORCE_EQ( - member_->places_[0], platform::CUDAGraphCapturingPlace(), + member_->places_[0], + platform::CUDAGraphCapturingPlace(), platform::errors::InvalidArgument("The place to capture CUDAGraph is " "not the same as the place to run.")); } @@ -972,7 +1016,8 @@ FetchResultType ParallelExecutor::Run( ResetHasFeedGuard reset_has_feed_guard(member_); - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), fetch_tensors, + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + fetch_tensors, member_->HasGarbageCollectors()); VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; @@ -992,7 +1037,8 @@ void ParallelExecutor::RunWithoutFetch( ResetHasFeedGuard reset_has_feed_guard(member_); - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), skip_eager_vars, + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + skip_eager_vars, member_->HasGarbageCollectors()); VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; @@ -1015,7 +1061,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( if (platform::IsCUDAGraphCapturing()) { for (auto &tensor : tensors) { PADDLE_ENFORCE_EQ( - tensor.empty(), true, + tensor.empty(), + true, platform::errors::PermissionDenied( "Feeding data is not permitted when capturing CUDA Graph.")); } @@ -1023,7 +1070,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( } if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), + PADDLE_ENFORCE_EQ(tensors.size(), + member_->local_scopes_.size(), platform::errors::Unimplemented( "The feed data number %d does not match the device " "number %d. If you are using DataLoader to feed " @@ -1031,9 +1079,11 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( "in training network. Currently, drop_last=False for " "DataLoader is not supported for training network. 
" "Please set drop_last=True when defining DataLoader.", - tensors.size(), member_->local_scopes_.size())); + tensors.size(), + member_->local_scopes_.size())); } else { - PADDLE_ENFORCE_GE(member_->local_scopes_.size(), tensors.size(), + PADDLE_ENFORCE_GE(member_->local_scopes_.size(), + tensors.size(), platform::errors::InvalidArgument( "The feed tensor number exceeds the device number")); } @@ -1063,7 +1113,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( } if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(feed_num, member_->local_scopes_.size(), + PADDLE_ENFORCE_EQ(feed_num, + member_->local_scopes_.size(), platform::errors::Unimplemented( "The feed data number %d does not match the device " "number %d. If you are using DataLoader to feed " @@ -1071,7 +1122,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( "in training network. Currently, drop_last=False for " "DataLoader is not supported for training network. " "Please set drop_last=True when defining DataLoader.", - feed_num, member_->local_scopes_.size())); + feed_num, + member_->local_scopes_.size())); } } @@ -1079,7 +1131,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( - tensors.empty(), true, + tensors.empty(), + true, platform::errors::PermissionDenied( "Feeding data is not permitted when capturing CUDA Graph.")); return; @@ -1103,7 +1156,9 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( auto error_info = string::Sprintf( "The number(%d) of samples[%s] of current batch is less than the " "count(%d) of devices(%s), currently, it is not allowed. ", - lod_tensors.size(), pair.first, num_places, + lod_tensors.size(), + pair.first, + num_places, (is_cpu_place ? "CPU" : "GPU")); if (is_cpu_place) { error_info += @@ -1116,10 +1171,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( lod_tensors.reserve(num_places); auto &tensor = lod_tensors.front(); PADDLE_ENFORCE_EQ( - tensor.dims(), pair.second.dims(), + tensor.dims(), + pair.second.dims(), platform::errors::PreconditionNotMet("The dim doesn't match.")); PADDLE_ENFORCE_EQ( - tensor.place(), member_->places_.at(0), + tensor.place(), + member_->places_.at(0), platform::errors::PreconditionNotMet("The place doesn't match.")); for (size_t i = 1; i < num_places; ++i) { lod_tensors.emplace_back(); @@ -1135,8 +1192,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( "sample will be copied in %d copies and be sent to different " "places separately. If you need that different place has different " "value, you should feed %d samples.", - lod_tensors.size(), pair.first, num_places, - (is_cpu_place ? "CPU" : "GPU"), pair.first, num_places, num_places); + lod_tensors.size(), + pair.first, + num_places, + (is_cpu_place ? 
"CPU" : "GPU"), + pair.first, + num_places, + num_places); PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); } } @@ -1147,7 +1209,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( persistable_feed_len = lod_tensors.size(); } else { PADDLE_ENFORCE_EQ( - persistable_feed_len, lod_tensors.size(), + persistable_feed_len, + lod_tensors.size(), platform::errors::InvalidArgument( "The feeded number of different persistable variables " "should be the same")); @@ -1157,7 +1220,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( non_persistable_feed_len = lod_tensors.size(); } else { PADDLE_ENFORCE_EQ( - non_persistable_feed_len, lod_tensors.size(), + non_persistable_feed_len, + lod_tensors.size(), platform::errors::InvalidArgument( "The feeded number of different non-persistable variables " "should be the same")); @@ -1180,7 +1244,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( non_persistable_feed_len != -1UL) { VLOG(10) << "Persistable len " << persistable_feed_len; VLOG(10) << "Non persistable len " << non_persistable_feed_len; - PADDLE_ENFORCE_GE(persistable_feed_len, non_persistable_feed_len, + PADDLE_ENFORCE_GE(persistable_feed_len, + non_persistable_feed_len, platform::errors::InvalidArgument( "The feeded number of persistable variables should " "not be less than non-persistable variables")); @@ -1201,7 +1266,8 @@ ParallelExecutor::~ParallelExecutor() { } bool ParallelExecutor::EnableParallelGraphExecution( - const ir::Graph &graph, const ExecutionStrategy &exec_strategy, + const ir::Graph &graph, + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { if (!FLAGS_enable_parallel_graph) { return false; @@ -1242,8 +1308,10 @@ bool ParallelExecutor::EnableParallelGraphExecution( } void ParallelExecutor::InitExecutorPrivateMemberInfo( - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - size_t device_count, const ir::Graph &graph) { + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph) { member_->use_device_ = exec_strategy.use_device_; member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = member_->build_strategy_.reduce_ == @@ -1259,7 +1327,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( - device_count, 1, + device_count, + 1, platform::errors::Unavailable("Windows can support Single GPU only.")); } #endif @@ -1268,7 +1337,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( - device_count, 1, + device_count, + 1, platform::errors::PermissionDenied( "Your machine has multiple cards, " "but the WITH_NCCL option is not turned on during compilation, " @@ -1289,14 +1359,16 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( VLOG(1) << string::Sprintf( "The Program will be executed on %s using ParallelExecutor, %lu " "cards are used, so %lu programs are executed in parallel.", - device_name, device_count, device_count); + device_name, + device_count, + device_count); // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. 
member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(graph, exec_strategy, - member_->build_strategy_); + EnableParallelGraphExecution( + graph, exec_strategy, member_->build_strategy_); if (member_->build_strategy_.enable_parallel_graph_) { LOG(INFO) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," @@ -1305,7 +1377,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } void ParallelExecutor::CreateLocalScopes( - Scope *global_scope, const std::vector &local_scopes, + Scope *global_scope, + const std::vector &local_scopes, bool create_new) { if (local_scopes.empty()) { member_->own_local_scope_ = true; @@ -1315,11 +1388,13 @@ void ParallelExecutor::CreateLocalScopes( } } else { member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + local_scopes.size(), platform::errors::PreconditionNotMet( "member_->places_.size() = %d is not equal to " "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); + member_->places_.size(), + local_scopes.size())); for (size_t i = 0; i < member_->places_.size(); ++i) { if (create_new) { member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); @@ -1344,12 +1419,13 @@ std::unordered_map ParallelExecutor::CreateLocalExecScopes( scope_map.emplace(scope, local_exec_scope); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), + member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), + member_->local_exec_scopes_.size())); return scope_map; } @@ -1358,7 +1434,8 @@ std::vector ParallelExecutor::CloneGraphToMultiDevices( ir::Graph *graph) { std::vector graphs; if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), + false, platform::errors::Unavailable( "gpu mode does not support async_mode_ now!")); graphs.push_back(graph); @@ -1424,7 +1501,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { } std::vector ParallelExecutor::CompileGraphWithBuildStrategy( - ir::Graph *graph, std::vector *device_graphs, + ir::Graph *graph, + std::vector *device_graphs, const std::string &loss_var_name) { auto device_count = member_->places_.size(); std::vector async_graphs(device_count); @@ -1432,66 +1510,99 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( auto &graphs = *device_graphs; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), device_count, + PADDLE_ENFORCE_EQ(graphs.size(), + device_count, platform::errors::PreconditionNotMet( "graphs.size() shoule be %d, but received %d", - device_count, graphs.size())); + device_count, + graphs.size())); VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); + graph = 
member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_, + member_->nccl_ctxs_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_, + member_->nccl_ctxs_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_, + member_->nccl_ctxs_); } #elif defined(PADDLE_WITH_XPU_BKCL) if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), device_count, + PADDLE_ENFORCE_EQ(graphs.size(), + device_count, platform::errors::PreconditionNotMet( "graphs.size() shoule be %d, but received %d", - device_count, graphs.size())); + device_count, + graphs.size())); VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_, + member_->bkcl_ctxs_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_, + member_->bkcl_ctxs_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_, + member_->bkcl_ctxs_); } #else if (member_->build_strategy_.async_mode_) { VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); + graph = member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_); } 
#endif @@ -1501,11 +1612,13 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( void ParallelExecutor::CreateVariableInfos( std::vector *var_infos, ir::Graph *graph) { PADDLE_ENFORCE_EQ( - var_infos->size(), 0, + var_infos->size(), + 0, platform::errors::PreconditionNotMet( "var_infos->size() shoule be 0, but received %d", var_infos->size())); PADDLE_ENFORCE_EQ( - member_->is_persistable_.size(), 0, + member_->is_persistable_.size(), + 0, platform::errors::PreconditionNotMet( "member_->is_persistable_.size() shoule be 0, but received %d", member_->is_persistable_.size())); @@ -1535,14 +1648,18 @@ void ParallelExecutor::CreateVariableInfos( std::vector ParallelExecutor::CreateSSAGraphExecutor( const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, ir::Graph *graph) { + std::vector *async_graphs, + ir::Graph *graph) { std::vector final_graphs; if (member_->build_strategy_.async_mode_) { VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, *async_graphs)); + member_->executor_.reset( + new details::AsyncSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + *async_graphs)); final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; @@ -1552,9 +1669,12 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( bool is_inference = details::IsDataParallelInferenceGraph(*graph); bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); + auto *pg_exe = + new details::ParallelSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph); final_graphs = pg_exe->Graphs(); member_->executor_.reset(pg_exe); @@ -1580,8 +1700,11 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + std::move(possible_inference_graphs)); if (!has_drop_last_read_op) { VLOG(5) << "Enable partial feed support in inference phase"; pg_exe->EnablePartialFeedSupport(); @@ -1598,16 +1721,22 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( "network. 
It is automatically turned to drop_last=True."; if (exec_strategy.type_ == ExecutionStrategy::kDefault) { VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); + member_->executor_.reset( + new details::ThreadedSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); } else { if (member_->use_device_ == p::kXPU) { #if defined(PADDLE_WITH_XPU) VLOG(3) << "use BindThreadedSSAGraphExecutor"; member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," @@ -1616,8 +1745,11 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); } } final_graphs.emplace_back(graph); @@ -1630,12 +1762,14 @@ void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( const std::vector &final_graphs, const std::unordered_map &scope_map) { PADDLE_ENFORCE_GE( - final_graphs.size(), 1, + final_graphs.size(), + 1, platform::errors::PreconditionNotMet( "final_graphs shoule contain at least one graph, but received %d", final_graphs.size())); - PADDLE_ENFORCE_GT(scope_map.size(), 0, + PADDLE_ENFORCE_GT(scope_map.size(), + 0, platform::errors::PreconditionNotMet( "scope_map shoule contain at least one " "element, but received %d", @@ -1676,21 +1810,26 @@ void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { if (!build_strategy.allow_cuda_graph_capture_) return; #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_EQ( - build_strategy.async_mode_, false, + build_strategy.async_mode_, + false, platform::errors::InvalidArgument( "Async Executor does not support CUDA Graph capturing.")); PADDLE_ENFORCE_EQ( - platform::IsCUDAGraphCapturing(), false, + platform::IsCUDAGraphCapturing(), + false, platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " "when running the first batch.")); PADDLE_ENFORCE_EQ( - member_->places_.size(), 1, + member_->places_.size(), + 1, platform::errors::InvalidArgument( "CUDA Graph is only supported when one GPU device is running.")); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), + true, platform::errors::InvalidArgument( "CUDA Graph is only supported on NVIDIA GPU device.")); - PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, + false, platform::errors::InvalidArgument( "FLAGS_sync_nccl_allreduce must be False to support " "CUDA Graph capturing.")); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index ac7bf6d87da2d..3015224656890 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -91,8 +91,8 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { << 
framework::DataTypeToString(var->ForwardDataType()) << " real var in dynamic graph."; framework::Tensor out; - framework::TransComplexToReal(var->ForwardDataType(), var->DataType(), - *tensor, &out); + framework::TransComplexToReal( + var->ForwardDataType(), var->DataType(), *tensor, &out); SetTensorToVariable(var->Var(), out, var->MutableVar()); } } @@ -147,8 +147,10 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, template PreparedOp PrepareImpl( - const NameVarMap& ins, const NameVarMap& outs, - const framework::OperatorWithKernel& op, const platform::Place& place, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::OperatorWithKernel& op, + const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const phi::KernelFactory& phi_kernel_factory, @@ -267,9 +269,14 @@ PreparedOp PrepareImpl( dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - phi_kernel, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + phi_kernel, + dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; @@ -318,16 +325,21 @@ PreparedOp PrepareImpl( << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); return PreparedOp( - op, empty_ctx, + op, + empty_ctx, framework::TransPhiKernelKeyToOpKernelType(pt_cpu_kernel_key), - arg_map_fn, default_kernel_signature, std::move(kernel_signature), - pt_cpu_kernel, cpu_ctx); + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + pt_cpu_kernel, + cpu_ctx); } } } PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::NotFound( "There are no kernels which are registered in the %s operator.", op.Type())); @@ -413,17 +425,24 @@ PreparedOp PrepareImpl( #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator %s does not have kernel for %s.", op.Type(), - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator %s does not have kernel for %s.", + op.Type(), + KernelTypeToString(expected_kernel_key))); if (!(expected_kernel_key.place_ == place)) { dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, kernel_iter->second, - arg_map_fn, default_kernel_signature, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + kernel_iter->second, + arg_map_fn, + default_kernel_signature, + dev_ctx); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -432,8 +451,14 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs, - phi_kernel_factory, phi_op_utils_map, + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, default_phi_kernel_sig_map); } @@ -443,9 +468,15 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) 
{ - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -454,39 +485,55 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } template static void PreparedOpRunImpl( - const framework::OperatorBase& op, const framework::RuntimeContext& ctx, + const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, empty_scope, *dev_ctx, ctx, ins, - outs, attrs, default_attrs)); + func(DygraphExecutionContext( + op, empty_scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); } if (FLAGS_check_nan_inf) { @@ -525,30 +572,45 @@ static void PreparedOpRunPtImpl( const framework::OpKernelType& kernel_type, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - const phi::KernelSignature& kernel_signature, const phi::Kernel& phi_kernel, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + const phi::KernelSignature& kernel_signature, + const phi::Kernel& phi_kernel, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + 
arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); PreparePhiData(phi_kernel, kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPhiKernelContext(kernel_signature, phi_kernel, ins, - outs, attrs, default_attrs, dev_ctx, + BuildDygraphPhiKernelContext(kernel_signature, + phi_kernel, + ins, + outs, + attrs, + default_attrs, + dev_ctx, &pt_kernel_context); phi_kernel(&pt_kernel_context); @@ -577,14 +639,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl(op_, kernel_type_, arg_map_fn_, - default_kernel_signature_, kernel_signature_, - phi_kernel_, dev_ctx_, ins, outs, attrs, + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, default_attrs); } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, arg_map_fn_, - default_kernel_signature_, dev_ctx_, ins, outs, - attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -593,14 +670,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -609,14 +701,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4c4ddf24dab68..5b1c4d3376cb8 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -181,15 +181,23 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( } template -void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameVarMap& ins, const NameVarMap& outs, 
framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { - TraceOpImpl(type, ins, outs, attrs, place, trace_backward, - inplace_map, passed_default_attrs_, + TraceOpImpl(type, + ins, + outs, + attrs, + place, + trace_backward, + inplace_map, + passed_default_attrs_, use_default_attr_map); } @@ -198,7 +206,8 @@ void Tracer::TraceOpImpl(const std::string& type, const NameVarMap& ins, const NameVarMap& outs, framework::AttributeMap& attrs, - const platform::Place& place, bool trace_backward, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { @@ -237,8 +246,9 @@ void Tracer::TraceOpImpl(const std::string& type, const auto& tracer = imperative::GetCurrentTracer(); VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; ins_amp = std::make_unique>( - AutoCastInputs(type, imperative::AutoTuneLayout( - type, ins, outs, &attrs, tracer))); + AutoCastInputs(type, + imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; ins_amp = std::make_unique>( @@ -248,10 +258,10 @@ void Tracer::TraceOpImpl(const std::string& type, if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; - ins_amp = - std::make_unique>(CastPureFp16Inputs( - type, imperative::AutoTuneLayout(type, ins, outs, &attrs, - tracer))); + ins_amp = std::make_unique>( + CastPureFp16Inputs(type, + imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; ins_amp = std::make_unique>( @@ -316,10 +326,12 @@ void Tracer::TraceOpImpl(const std::string& type, framework::AppendErrorOpHint(type, &exception); throw std::move(exception); } catch (std::exception& ex) { - PADDLE_THROW(platform::errors::Fatal( - "Operator %s raises an %s exception.\n" - "The exception content is\n:%s.", - type, platform::demangle(typeid(ex).name()), ex.what())); + PADDLE_THROW( + platform::errors::Fatal("Operator %s raises an %s exception.\n" + "The exception content is\n:%s.", + type, + platform::demangle(typeid(ex).name()), + ex.what())); } catch (...) { // NOTE: this branch represents a very serious bug with // low probability of occurrence, and we can't get its @@ -339,13 +351,14 @@ void Tracer::TraceOpImpl(const std::string& type, if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, + passed_default_attrs_, + nullptr, paddle::platform::errors::PermissionDenied( "We expect passed_default_attrs_ is nullptr while " "use_default_attr_map is true, however we got not null " "passed_default_attrs_. Please check your usage of trace_op. 
")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); + CreateGradOpNode( + *op, new_ins, outs, attrs, default_attrs, place, inplace_map); } else { VLOG(3) << "No Grad to track for Op: " << type; } @@ -354,27 +367,43 @@ void Tracer::TraceOpImpl(const std::string& type, } template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const std::string& type, + const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap attrs, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, - paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); + paddle::framework::AttributeMap* default_attrs, + bool use_default_attr_map); template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const std::string& type, + const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap attrs, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map_, - paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); + paddle::framework::AttributeMap* default_attrs, + bool use_default_attr_map); -void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs, +void Tracer::TraceOp(const std::string& type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs, + framework::AttributeMap attrs, const std::map& inplace_map) { - TraceOp(type, ins, outs, std::move(attrs), expected_place_, - has_grad_, inplace_map); + TraceOp(type, + ins, + outs, + std::move(attrs), + expected_place_, + has_grad_, + inplace_map); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap& attrs, const paddle::platform::Place& place, @@ -383,26 +412,41 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: " << use_default_attr_map; - TraceOpImpl(type, ins, outs, attrs, place, false, - inplace_map, default_attrs, + TraceOpImpl(type, + ins, + outs, + attrs, + place, + false, + inplace_map, + default_attrs, use_default_attr_map); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap attrs) { VLOG(6) << "Running On Eager TraceOp(4 agrs): "; - TraceOpImpl(type, ins, outs, attrs, expected_place_, - false, {}, nullptr, true); + TraceOpImpl( + type, ins, outs, attrs, expected_place_, false, {}, nullptr, true); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap& attrs, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp(less): "; - TraceOpImpl(type, ins, outs, attrs, expected_place_, - false, inplace_map, nullptr, true); + TraceOpImpl(type, + ins, + outs, + attrs, + expected_place_, + false, + inplace_map, + nullptr, + true); } void Tracer::SetExpectedPlace(platform::Place place) { @@ 
-434,8 +478,10 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, framework::AttributeMap attrs) const { + const std::string& type, + const NameTensorMap& ins, + const NameTensorMap& outs, + framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -451,11 +497,18 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = imperative::DygraphExecutionContext( - *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + *op, + framework::Scope(), + *dev_ctx, + ctx, + ins, + outs, + attrs, default_attrs); auto* opbase_with_kernel = dynamic_cast(op.get()); - PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + PADDLE_ENFORCE_NE(opbase_with_kernel, + nullptr, platform::errors::InvalidArgument( "This op type:`%s` is not a OperatorWithKernel, only " "OperatorWithKernel can get KernelSignature", diff --git a/paddle/fluid/operators/identity_loss_op.cc b/paddle/fluid/operators/identity_loss_op.cc index f2b28ba37d339..bc9986c7ffea1 100644 --- a/paddle/fluid/operators/identity_loss_op.cc +++ b/paddle/fluid/operators/identity_loss_op.cc @@ -91,14 +91,18 @@ DECLARE_INPLACE_OP_INFERER(IdentityLossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, IdentityLossInferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, + IdentityLossInferShapeFunctor, PD_INFER_META(phi::IdentityLossInferMeta)); -REGISTER_OPERATOR(identity_loss, ops::IdentityLossOp, ops::IdentityLossOpMaker, +REGISTER_OPERATOR(identity_loss, + ops::IdentityLossOp, + ops::IdentityLossOpMaker, ops::IdentityLossGradMaker, ops::IdentityLossGradMaker, ops::IdentityLossInplaceInferer, IdentityLossInferShapeFunctor); -REGISTER_OPERATOR(identity_loss_grad, ops::IdentityLossGradOp, +REGISTER_OPERATOR(identity_loss_grad, + ops::IdentityLossGradOp, ops::IdentityLossGradInplaceInferer); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index fbe3e1ea639d0..c75c36a278cb3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -93,7 +93,9 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& vec) const { if (dtype_ == VarType::FP16) { std::vector vec_fp16; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp16), [](float f) -> float16 { return float16(f); }); framework::TensorFromVector(vec_fp16, tensor_); } else { @@ -109,7 +111,9 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& vec) const { // popart do not support float64 constant std::vector vec_fp32; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp32), + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp32), [](double f) -> float { return static_cast(f); }); framework::TensorFromVector(vec_fp32, tensor_); } @@ -362,7 +366,8 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( - tensor, 
resources_->tensors.end(), + tensor, + resources_->tensors.end(), platform::errors::NotFound( "Output tensor %s is not found, please check the model.", fetch_name)); @@ -429,8 +434,9 @@ void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "lowering weight: " << var_name; auto var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound("Tensor %s is not found in the scope", - var_name)); + var, + platform::errors::NotFound("Tensor %s is not found in the scope", + var_name)); auto tensor = var->Get(); auto dtype = PhiDType2PopartDType(tensor.dtype()); auto shape = std::vector(); @@ -482,9 +488,12 @@ void Compiler::LowerBody() { VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); NameScopeHelper ns_helper(op_desc, builder_.get()); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); + auto output_ids = builder_->customOp(it->second.popart_op, + it->second.popart_op.version, + inputs, + outputs.size(), + attributes, + debug_context); PostLower(output_ids, op_desc); } else if (op_type == "popart_printtensor") { auto inputs = GetOpInputs(op_desc); @@ -572,14 +581,17 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(momentum, true), popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true), clip_norm_settings); + popart::OptimizerValue(loss_scaling, true), + clip_norm_settings); }; resources_->eval_optimizer = std::make_unique( popart::OptimizerValue(0.0, false), popart::OptimizerValue(0.0, false), - popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(), + popart::OptimizerValue(0.0, true), + popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true), clip_norm_settings); + popart::OptimizerValue(loss_scaling, true), + clip_norm_settings); } else if (type == "adam") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -609,10 +621,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, {"defaultMaxWeightNorm", {mwn, true}}}; - auto optimizer_instance = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings, scaled_optimizer_state_); + auto optimizer_instance = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { optimizer_instance->insertSpecific( weight_decay_vars[i], @@ -629,9 +646,14 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), - popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings, scaled_optimizer_state_); + popart::OptimizerValue(mwn, true), + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + clip_norm_settings, + scaled_optimizer_state_); } }; if (adam_mode == popart::AdamMode::Lamb) { @@ -642,11 +664,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, 
{"defaultMaxWeightNorm", {mwn, true}}}; - auto eval_optimizer = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, - scaled_optimizer_state_); + auto eval_optimizer = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -660,11 +686,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, {"defaultMaxWeightNorm", {mwn, true}}}; - auto eval_optimizer = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, - scaled_optimizer_state_); + auto eval_optimizer = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -678,9 +708,13 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), - popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, + popart::OptimizerValue(mwn, true), + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, scaled_optimizer_state_); } } else if (type == "adaptive") { @@ -705,9 +739,13 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), - popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, - accl2_type, accl3_type); + popart::OptimizerValue(loss_scaling, true), + adaptive_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + accl3_type); }; resources_->eval_optimizer = std::make_unique( popart::OptimizerValue(0.0, false), @@ -715,9 +753,12 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), - popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, - popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::OptimizerValue(loss_scaling, true), + adaptive_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, popart::DataType::UNDEFINED); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -749,7 +790,8 @@ void Compiler::PostLower(const std::vector& tensor_ids, // Record output tensors auto pd_outs = GetOpOutputs(op_desc); PADDLE_ENFORCE_EQ( - pd_outs.size(), tensor_ids.size(), + pd_outs.size(), + tensor_ids.size(), platform::errors::Fatal("paddle and popart op have different outputs")); for (int i = 0; i < tensor_ids.size(); ++i) { 
resources_->tensors.emplace(pd_outs[i], tensor_ids[i]); @@ -763,13 +805,15 @@ void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc) { // Record output tensor auto pd_outs = GetOpOutputs(op_desc); PADDLE_ENFORCE_EQ( - pd_outs.size(), 1, + pd_outs.size(), + 1, platform::errors::Fatal("paddle and popart op have different outputs")); resources_->tensors.emplace(pd_outs[0], tensor_id); PostLower(tensor_id, op_desc, false); } -void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc, +void Compiler::PostLower(const std::string& tensor_id, + const OpDesc* op_desc, bool skip_pipline) { // Set pipline if (!skip_pipline && op_desc->HasAttr(sIpuIndexAttr)) { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 43c6866296a61..279914a0b931b 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -76,8 +76,10 @@ class PdIArray final : public popart::IArray { std::size_t rank() const { return tensor_.dims().size(); } int64_t dim(size_t index) const { return tensor_.dims().at(index); } std::size_t nelms() const { - return std::accumulate(shape_.begin(), shape_.end(), - static_cast(1), std::multiplies()); + return std::accumulate(shape_.begin(), + shape_.end(), + static_cast(1), + std::multiplies()); } const popart::Shape shape() const { return shape_; } @@ -108,14 +110,23 @@ void Executor::Prepare(const std::string &proto) { VLOG(10) << "Creating TrainingSession from Onnx Model..."; auto optimizer = compiler_resources_->NewOptimizer(); session_ = popart::TrainingSession::createFromOnnxModel( - proto, dataFlow, compiler_resources_->loss_var, *optimizer, device_, - popart::InputShapeInfo(), ipu_strategy_->popart_options, + proto, + dataFlow, + compiler_resources_->loss_var, + *optimizer, + device_, + popart::InputShapeInfo(), + ipu_strategy_->popart_options, ipu_strategy_->popart_patterns); } else { VLOG(10) << "Creating InferenceSession from Onnx Model..."; session_ = popart::InferenceSession::createFromOnnxModel( - proto, dataFlow, device_, popart::InputShapeInfo(), - ipu_strategy_->popart_options, ipu_strategy_->popart_patterns); + proto, + dataFlow, + device_, + popart::InputShapeInfo(), + ipu_strategy_->popart_options, + ipu_strategy_->popart_patterns); } VLOG(10) << "Creating session from Onnx Model...done"; @@ -287,8 +298,9 @@ void Executor::AcquireDevice() { popart::DeviceManager::createDeviceManager().acquireAvailableDevice( RequestIpus(ipu_strategy_->num_ipus)); PADDLE_ENFORCE_NOT_NULL( - device_, errors::Unavailable("Can't attach IPU, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + device_, + errors::Unavailable("Can't attach IPU, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); VLOG(10) << "Create IPU device...done"; } VLOG(10) << "leave Executor::AcquireDevice"; @@ -377,19 +389,23 @@ void Executor::ConvertWeights(bool align_to_popart) { auto num_elem = info.nelms(); if (align_to_popart) { std::vector fp16_data; - std::transform(data_ptr, data_ptr + num_elem, + std::transform(data_ptr, + data_ptr + num_elem, std::back_inserter(fp16_data), [&](float elem) { return popart::floatToHalf(elem); }); - memcpy(reinterpret_cast(data_ptr), fp16_data.data(), + memcpy(reinterpret_cast(data_ptr), + fp16_data.data(), num_elem * sizeof(float16)); } else { std::vector fp32_data; auto fp16_data_ptr = reinterpret_cast(data_ptr); - std::transform(fp16_data_ptr, fp16_data_ptr + num_elem, - std::back_inserter(fp32_data), 
[&](uint16_t elem) { - return popart::halfToFloat(elem); - }); - memcpy(reinterpret_cast(data_ptr), fp32_data.data(), + std::transform( + fp16_data_ptr, + fp16_data_ptr + num_elem, + std::back_inserter(fp32_data), + [&](uint16_t elem) { return popart::halfToFloat(elem); }); + memcpy(reinterpret_cast(data_ptr), + fp32_data.data(), num_elem * sizeof(float)); } } else { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 708ea13177e7b..5acd075a6155f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -19,7 +19,8 @@ namespace { template void RegisterSetter( std::map>& options, // NOLINT - const std::string& name, Lambda setter) { + const std::string& name, + Lambda setter) { options[name] = setter; } @@ -27,7 +28,9 @@ template void RegisterGetter( std::map>& options, // NOLINT std::map& options_type, // NOLINT - const std::string& name, const std::string& type_str, Lambda getter) { + const std::string& name, + const std::string& type_str, + Lambda getter) { options[name] = getter; options_type[name] = type_str; } @@ -55,25 +58,28 @@ namespace ipu { IpuStrategy::IpuStrategy() { #define ADD_BOOL_OPTION(name) \ RegisterSetter(bool_options, #name, [&](bool value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", \ - [&]() { return std::to_string(name); }) + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(name); \ + }) -#define ADD_UINT64_OPTION(name) \ - RegisterSetter(uint64_options, #name, \ - [&](std::uint64_t value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "uint64", \ - [&]() { return std::to_string(name); }) +#define ADD_UINT64_OPTION(name) \ + RegisterSetter( \ + uint64_options, #name, [&](std::uint64_t value) { name = value; }); \ + RegisterGetter(options_getter, options_type, #name, "uint64", [&]() { \ + return std::to_string(name); \ + }) #define ADD_DOUBLE_OPTION(name) \ RegisterSetter(double_options, #name, [&](double value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "double", \ - [&]() { return std::to_string(name); }) + RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ + return std::to_string(name); \ + }) -#define ADD_STRING_OPTION(name) \ - RegisterSetter(string_options, #name, \ - [&](const std::string& value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "string", \ - [&]() { return name; }) +#define ADD_STRING_OPTION(name) \ + RegisterSetter( \ + string_options, #name, [&](const std::string& value) { name = value; }); \ + RegisterGetter( \ + options_getter, options_type, #name, "string", [&]() { return name; }) ADD_BOOL_OPTION(is_training); ADD_BOOL_OPTION(need_avg_shard); @@ -103,11 +109,12 @@ IpuStrategy::IpuStrategy() { #undef ADD_UINT64_OPTION #undef ADD_BOOL_OPTION -#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ - RegisterSetter(bool_options, #name, \ - [&](bool value) { runtime_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ - return std::to_string(runtime_options.aliased_name); \ +#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ + RegisterSetter(bool_options, #name, [&](bool value) { \ + runtime_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(runtime_options.aliased_name); \ }) 
ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval); @@ -117,7 +124,8 @@ IpuStrategy::IpuStrategy() { #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ PADDLE_ENFORCE_LT( \ - value, static_cast(popart::EnumType::N), \ + value, \ + static_cast(popart::EnumType::N), \ errors::InvalidArgument("Value for %s out of range", #EnumType)); \ popart_options.aliased_name = static_cast(value); \ }); \ @@ -126,11 +134,12 @@ IpuStrategy::IpuStrategy() { static_cast(popart_options.aliased_name)); \ }) -#define ADD_POPART_BOOL_OPTION_ALIAS(name, aliased_name) \ - RegisterSetter(bool_options, #name, \ - [&](bool value) { popart_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ - return std::to_string(popart_options.aliased_name); \ +#define ADD_POPART_BOOL_OPTION_ALIAS(name, aliased_name) \ + RegisterSetter(bool_options, #name, [&](bool value) { \ + popart_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(popart_options.aliased_name); \ }) #define ADD_POPART_UINT64_OPTION_ALIAS(name, aliased_name) \ @@ -141,19 +150,21 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.aliased_name); \ }) -#define ADD_POPART_DOUBLE_OPTION_ALIAS(name, aliased_name) \ - RegisterSetter(double_options, #name, \ - [&](double value) { popart_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ - return std::to_string(popart_options.aliased_name); \ +#define ADD_POPART_DOUBLE_OPTION_ALIAS(name, aliased_name) \ + RegisterSetter(double_options, #name, [&](double value) { \ + popart_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ + return std::to_string(popart_options.aliased_name); \ }) #define ADD_POPART_STRING_OPTION_ALIAS(name, aliased_name) \ RegisterSetter(string_options, #name, [&](const std::string& value) { \ popart_options.aliased_name = value; \ }); \ - RegisterGetter(options_getter, options_type, #name, "string", \ - [&]() { return popart_options.aliased_name; }) + RegisterGetter(options_getter, options_type, #name, "string", [&]() { \ + return popart_options.aliased_name; \ + }) ADD_POPART_ENUM_OPTION_ALIAS(autodiff_settings.stitch_strategy, autodiffSettings.stitchStrategy, @@ -167,14 +178,14 @@ IpuStrategy::IpuStrategy() { ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.batch_schedule, batchSerializationSettings.batchSchedule, BatchSerializationBatchSchedule); - ADD_POPART_ENUM_OPTION_ALIAS(auto_recomputation, autoRecomputation, - RecomputationType); - ADD_POPART_ENUM_OPTION_ALIAS(merge_var_update, mergeVarUpdate, - MergeVarUpdateType); - ADD_POPART_ENUM_OPTION_ALIAS(virtual_graph_mode, virtualGraphMode, - VirtualGraphMode); - ADD_POPART_ENUM_OPTION_ALIAS(synthetic_data_mode, syntheticDataMode, - SyntheticDataMode); + ADD_POPART_ENUM_OPTION_ALIAS( + auto_recomputation, autoRecomputation, RecomputationType); + ADD_POPART_ENUM_OPTION_ALIAS( + merge_var_update, mergeVarUpdate, MergeVarUpdateType); + ADD_POPART_ENUM_OPTION_ALIAS( + virtual_graph_mode, virtualGraphMode, VirtualGraphMode); + ADD_POPART_ENUM_OPTION_ALIAS( + synthetic_data_mode, syntheticDataMode, SyntheticDataMode); ADD_POPART_ENUM_OPTION_ALIAS(subgraph_copying_strategy, subgraphCopyingStrategy, SubgraphCopyingStrategy); @@ -183,7 +194,8 @@ IpuStrategy::IpuStrategy() { 
ReductionType); ADD_POPART_ENUM_OPTION_ALIAS( mean_accumulation_and_replication_reduction_strategy, - meanAccumulationAndReplicationReductionStrategy, MeanReductionStrategy); + meanAccumulationAndReplicationReductionStrategy, + MeanReductionStrategy); ADD_POPART_STRING_OPTION_ALIAS(log_dir, logDir); ADD_POPART_STRING_OPTION_ALIAS(cache_path, cachePath); @@ -309,14 +321,14 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS - RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", - [&]() { - std::vector res; - for (auto x : custom_ops) { - res.push_back(x.repr()); - } - return res; - }); + RegisterGetter( + vector_options_getter, options_type, "custom_ops", "vector", [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { @@ -326,11 +338,11 @@ IpuStrategy::IpuStrategy() { } }); - RegisterGetter(options_getter, options_type, "enable_manual_shard", "bool", - [&]() { - return std::to_string(popart_options.virtualGraphMode == - popart::VirtualGraphMode::Manual); - }); + RegisterGetter( + options_getter, options_type, "enable_manual_shard", "bool", [&]() { + return std::to_string(popart_options.virtualGraphMode == + popart::VirtualGraphMode::Manual); + }); RegisterSetter(bool_options, "enable_half_partial", [&](bool value) { if (value) { @@ -347,10 +359,11 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.partialsTypeMatMuls == "half"); }); - RegisterSetter(container_options, "dot_checks", + RegisterSetter(container_options, + "dot_checks", [&](const std::pair& p) { - std::vector valid_dot{"Fwd0", "Fwd1", "Bwd0", - "PreAlias", "Final"}; + std::vector valid_dot{ + "Fwd0", "Fwd1", "Bwd0", "PreAlias", "Final"}; if (std::find(valid_dot.begin(), valid_dot.end(), p.first) == valid_dot.end()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -359,16 +372,17 @@ IpuStrategy::IpuStrategy() { popart_options.dotChecks.insert(p.first); }); - RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector", - [&]() { - std::vector res; - for (auto x : popart_options.dotChecks) { - res.push_back(x); - } - return res; - }); + RegisterGetter( + vector_options_getter, options_type, "dot_checks", "vector", [&]() { + std::vector res; + for (auto x : popart_options.dotChecks) { + res.push_back(x); + } + return res; + }); - RegisterSetter(container_options, "hardware_instrumentations", + RegisterSetter(container_options, + "hardware_instrumentations", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.hardwareInstrumentations.insert( @@ -376,8 +390,11 @@ IpuStrategy::IpuStrategy() { }); RegisterGetter( - vector_options_getter, options_type, "hardware_instrumentations", - "vector", [&]() { + vector_options_getter, + options_type, + "hardware_instrumentations", + "vector", + [&]() { std::vector res; for (auto x : popart_options.hardwareInstrumentations) { res.push_back(std::to_string(static_cast(x))); @@ -385,59 +402,74 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "custom_codelets", + RegisterSetter(container_options, + "custom_codelets", [&](const std::pair& p) { popart_options.customCodelets.push_back(p.first); }); - RegisterGetter(vector_options_getter, options_type, "custom_codelets", - "vector", [&]() { - std::vector res; - for (auto x : popart_options.customCodelets) { - res.push_back(x); - } - return res; 
- }); + RegisterGetter( + vector_options_getter, options_type, "custom_codelets", "vector", [&]() { + std::vector res; + for (auto x : popart_options.customCodelets) { + res.push_back(x); + } + return res; + }); - RegisterSetter(container_options, "engine_options", + RegisterSetter(container_options, + "engine_options", [&](const std::pair& p) { popart_options.engineOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "engine_options", "map", - [&]() { return popart_options.engineOptions; }); + RegisterGetter( + map_options_getter, options_type, "engine_options", "map", [&]() { + return popart_options.engineOptions; + }); - RegisterSetter(container_options, "report_options", + RegisterSetter(container_options, + "report_options", [&](const std::pair& p) { popart_options.reportOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "report_options", "map", - [&]() { return popart_options.reportOptions; }); + RegisterGetter( + map_options_getter, options_type, "report_options", "map", [&]() { + return popart_options.reportOptions; + }); - RegisterSetter(container_options, "convolution_options", + RegisterSetter(container_options, + "convolution_options", [&](const std::pair& p) { popart_options.convolutionOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "convolution_options", "map", - [&]() { return popart_options.convolutionOptions; }); + RegisterGetter( + map_options_getter, options_type, "convolution_options", "map", [&]() { + return popart_options.convolutionOptions; + }); - RegisterSetter(container_options, "lstm_options", + RegisterSetter(container_options, + "lstm_options", [&](const std::pair& p) { popart_options.lstmOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "lstm_options", "map", - [&]() { return popart_options.lstmOptions; }); + RegisterGetter( + map_options_getter, options_type, "lstm_options", "map", [&]() { + return popart_options.lstmOptions; + }); - RegisterSetter(container_options, "gcl_options", + RegisterSetter(container_options, + "gcl_options", [&](const std::pair& p) { popart_options.gclOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "gcl_options", "map", - [&]() { return popart_options.gclOptions; }); + RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { + return popart_options.gclOptions; + }); // Default options @@ -465,15 +497,19 @@ void IpuStrategy::AddStringOption(const std::string& option, void IpuStrategy::InsertStringOption(const std::string& option, const std::string& value) { - set(option, std::pair(value, ""), container_options, + set(option, + std::pair(value, ""), + container_options, "vector"); } void IpuStrategy::InsertStringPairOption(const std::string& option, const std::string& key, const std::string& value) { - set(option, std::pair(key, value), - container_options, "map"); + set(option, + std::pair(key, value), + container_options, + "map"); } void IpuStrategy::SetTensorLocation(const std::string& tensor, @@ -553,7 +589,8 @@ void IpuStrategy::SetAccumulateOuterFragmentSettings( void IpuStrategy::AddCustomOp(const std::string& paddle_op, const std::string& popart_op, - const std::string& domain, int version) { + const std::string& domain, + int version) { LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; custom_ops.push_back( IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h 
b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 2b73b70b56ddf..7207e9199765f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -121,15 +121,19 @@ class IpuStrategy { void AddDoubleOption(const std::string &option, double value); void AddStringOption(const std::string &option, const std::string &value); void InsertStringOption(const std::string &option, const std::string &value); - void InsertStringPairOption(const std::string &option, const std::string &key, + void InsertStringPairOption(const std::string &option, + const std::string &key, const std::string &value); - void SetTensorLocation(const std::string &tensor, const std::string &option, + void SetTensorLocation(const std::string &tensor, + const std::string &option, std::uint64_t value); void SetReplicatedCollectivesSettings(const std::string &opt, bool value); void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, const std::vector &values); - void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, - const std::string &domain, int version); + void AddCustomOp(const std::string &paddle_op, + const std::string &popart_op, + const std::string &domain, + int version); void SetCompilationProgressLogger( const std::function &logger); @@ -146,15 +150,18 @@ class IpuStrategy { private: template void set( - const std::string &key, ValueType value, + const std::string &key, + ValueType value, std::map> &options, // NOLINT const std::string &type_str) { auto it = options.find(key); PADDLE_ENFORCE_NE( - it, options.end(), + it, + options.end(), platform::errors::InvalidArgument("Cannot find option: %s, type: %s " "when setting IpuStrategy options", - key, type_str)); + key, + type_str)); it->second(value); } @@ -164,7 +171,8 @@ class IpuStrategy { std::map> &options) { // NOLINT auto it = options.find(key); PADDLE_ENFORCE_NE( - it, options.end(), + it, + options.end(), platform::errors::InvalidArgument( "Cannot find option name: %s when trying to get IpuStrategy option", key)); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index 997fd9be070cb..1e9291cf57256 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -25,8 +25,8 @@ Node *custom_op_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto attrs = op->GetAttrMap(); attrs.insert({"__op_type", node->Op()->Type()}); - auto new_node = CreateBaseOp(graph, node, "popart_custom_op", node->inputs, - node->outputs, attrs); + auto new_node = CreateBaseOp( + graph, node, "popart_custom_op", node->inputs, node->outputs, attrs); return new_node; } @@ -43,15 +43,15 @@ Node *print_handler(Graph *graph, Node *node) { } auto attrs = AttributeMap{{"print_gradient", print_gradient}, {"title", title}}; - return CreateBaseOp(graph, node, "popart_printtensor", node->inputs, - node->outputs, attrs); + return CreateBaseOp( + graph, node, "popart_printtensor", node->inputs, node->outputs, attrs); } Node *popart_optimizer_handler(Graph *graph, Node *node) { return nullptr; } Node *checkpointoutput_handler(Graph *graph, Node *node) { - return CreateBaseOp(graph, node, "popart_checkpointoutput", node->inputs, - node->outputs); + return CreateBaseOp( + graph, node, "popart_checkpointoutput", node->inputs, node->outputs); } Node *custom_nll_loss_handler(Graph *graph, Node *node) { @@ 
-61,12 +61,18 @@ Node *custom_nll_loss_handler(Graph *graph, Node *node) {
   auto inputIsLogProbability = BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability"));
   if (ignoreIndex == "None") {
-    return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
+    return CreateBaseOp(graph,
+                        node,
+                        "popart_nllloss_v2",
+                        node->inputs,
                         node->outputs,
                         {{"reduction", reduction},
                          {"inputIsLogProbability", inputIsLogProbability}});
   } else {
-    return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
+    return CreateBaseOp(graph,
+                        node,
+                        "popart_nllloss_v2",
+                        node->inputs,
                         node->outputs,
                         {{"reduction", reduction},
                          {"ignoreIndex", std::atoi(ignoreIndex.c_str())},
@@ -75,20 +81,24 @@ Node *custom_nll_loss_handler(Graph *graph, Node *node) {
 }
 
 Node *identity_handler(Graph *graph, Node *node) {
-  return CreateBaseOp(graph, node, "popart_identity", node->inputs,
-                      node->outputs);
+  return CreateBaseOp(
+      graph, node, "popart_identity", node->inputs, node->outputs);
 }
 
 Node *identity_loss_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction"));
-  return CreateBaseOp(graph, node, "popart_identity_loss", node->inputs,
-                      node->outputs, {{"reduction", reduction}});
+  return CreateBaseOp(graph,
+                      node,
+                      "popart_identity_loss",
+                      node->inputs,
+                      node->outputs,
+                      {{"reduction", reduction}});
 }
 
 Node *detach_handler(Graph *graph, Node *node) {
-  return CreateBaseOp(graph, node, "popart_detach_v2", node->inputs,
-                      node->outputs);
+  return CreateBaseOp(
+      graph, node, "popart_detach_v2", node->inputs, node->outputs);
 }
 
 } // namespace
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f742b0716e284..6f6950f9cd1c4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -56,7 +56,8 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   auto& desired_dev_ctx = static_cast(dev_ctx);
   if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size,
+    return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
+                                 size,
                                  phi::Stream(reinterpret_cast(
                                      desired_dev_ctx.stream())));
   } else {
@@ -226,11 +227,15 @@ template
 inline void EmplaceDeviceContext(
     std::map>>* place_to_device_context,
-    platform::Place place, bool disable_setting_default_stream_for_allocator) {
+    platform::Place place,
+    bool disable_setting_default_stream_for_allocator) {
   // lazy evaluation. i.e., only create device context at first `Get`
   place_to_device_context->emplace(
-      place, std::async(std::launch::deferred, CreateDeviceContext,
-                        place, disable_setting_default_stream_for_allocator));
+      place,
+      std::async(std::launch::deferred,
+                 CreateDeviceContext,
+                 place,
+                 disable_setting_default_stream_for_allocator));
 }
 
 void EmplaceDeviceContexts(
@@ -239,7 +244,8 @@ void EmplaceDeviceContexts(
     const std::vector& places,
     bool disable_setting_default_stream_for_allocator) {
   PADDLE_ENFORCE_GT(
-      places.size(), 0,
+      places.size(),
+      0,
       platform::errors::InvalidArgument("The number of platform places should "
                                         "be larger than 0.
But received %d.", places.size())); @@ -253,17 +259,20 @@ void EmplaceDeviceContexts( if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -273,7 +282,8 @@ void EmplaceDeviceContexts( } else if (platform::is_cuda_pinned_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -283,7 +293,8 @@ void EmplaceDeviceContexts( } else if (platform::is_xpu_place(p)) { #ifdef PADDLE_WITH_XPU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -293,7 +304,8 @@ void EmplaceDeviceContexts( } else if (platform::is_mlu_place(p)) { #ifdef PADDLE_WITH_MLU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -303,7 +315,8 @@ void EmplaceDeviceContexts( } else if (platform::is_ipu_place(p)) { #ifdef PADDLE_WITH_IPU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -313,7 +326,8 @@ void EmplaceDeviceContexts( } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -323,7 +337,8 @@ void EmplaceDeviceContexts( } else if (platform::is_npu_pinned_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -334,7 +349,8 @@ void EmplaceDeviceContexts( } else if (platform::is_custom_place(p)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -348,7 +364,8 @@ void EmplaceDeviceContexts( DeviceContextPool::DeviceContextPool( const std::vector& places) { - EmplaceDeviceContexts(&device_contexts_, places, + EmplaceDeviceContexts(&device_contexts_, + places, /*disable_setting_default_stream_for_allocator=*/false); } @@ -403,8 +420,8 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - platform::RecordEvent record_event("NPUDeviceContext/wait", - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + "NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2); VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); } @@ -847,7 +864,8 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { VLOG(3) << "Prevented Clearing DNNL cache. 
Updated " "block_next_cache_clearing_ : " << block_next_cache_clearing_; - PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0, + PADDLE_ENFORCE_GE(block_next_cache_clearing_, + 0, platform::errors::InvalidArgument( "Cache clearing mark should be non-negative " ". But received %d.", diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index ce8b825b13fe8..569890fa25cd6 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -101,8 +101,8 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { // 2. call hook and return PyObject *res = nullptr; try { - res = PyObject_CallFunctionObjArgs(py_func_, py::cast(tmp_varbase).ptr(), - nullptr); + res = PyObject_CallFunctionObjArgs( + py_func_, py::cast(tmp_varbase).ptr(), nullptr); } catch (platform::EnforceNotMet &e) { throw std::move(e); } catch (std::exception &e) { @@ -159,8 +159,10 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { } // only initialize varbase, but not its tensor. -static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, - bool persistable = false, int stop_gradient = -1) { +static void InitVarBaseOnly(imperative::VarBase *self, + const std::string &name, + bool persistable = false, + int stop_gradient = -1) { auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( "generated_tensor") : name; @@ -177,10 +179,13 @@ static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, } // initialize varbase and its tensor. -static void InitVarBaseAndTensor( - imperative::VarBase *self, const py::array &array, - const platform::Place &place, const std::string &name, - bool persistable = false, bool zero_copy = false, int stop_gradient = -1) { +static void InitVarBaseAndTensor(imperative::VarBase *self, + const py::array &array, + const platform::Place &place, + const std::string &name, + bool persistable = false, + bool zero_copy = false, + int stop_gradient = -1) { InitVarBaseOnly(self, name, persistable, stop_gradient); auto *tensor = self->MutableVar()->GetMutable(); VLOG(4) << "zero_copy: " << zero_copy; @@ -191,8 +196,8 @@ static void InitVarBaseAndTensor( } else if (platform::is_gpu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray(tensor, array, place, - zero_copy); + SetTensorFromPyArray( + tensor, array, place, zero_copy); } else if (platform::is_npu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_ipu_place(place)) { @@ -200,8 +205,8 @@ static void InitVarBaseAndTensor( } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_custom_place(place)) { - SetTensorFromPyArray(tensor, array, place, - zero_copy); + SetTensorFromPyArray( + tensor, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " @@ -231,8 +236,8 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, // ignored auto place = kwargs.contains("place") ? 
PyObjectToPlace(kwargs["place"]) : default_place; - InitVarBaseAndTensor(self, array, place, name, persistable, zero_copy, - stop_gradient); + InitVarBaseAndTensor( + self, array, place, name, persistable, zero_copy, stop_gradient); } else { InitVarBaseOnly(self, name, persistable, stop_gradient); } @@ -240,7 +245,8 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, template static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self, - const py::array &array, const P &place, + const py::array &array, + const P &place, bool persistable = false, bool zero_copy = false, std::string name = "", @@ -400,7 +406,8 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( } PADDLE_ENFORCE_EQ( - PyErr_Occurred(), nullptr, + PyErr_Occurred(), + nullptr, platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); return result; } @@ -422,7 +429,8 @@ paddle::imperative::NameTensorMap ConvertToNameTensorMap( } PADDLE_ENFORCE_EQ( - PyErr_Occurred(), nullptr, + PyErr_Occurred(), + nullptr, platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); return result; } @@ -430,7 +438,8 @@ paddle::imperative::NameTensorMap ConvertToNameTensorMap( template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT - const P &dst_device, const bool blocking) { + const P &dst_device, + const bool blocking) { if (dst.SharedVar()->IsEmpty()) { VLOG(3) << "deep copy Variable from " << src->Name() << " to " << dst.Name(); @@ -457,7 +466,8 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT dst.MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); - framework::TensorCopy(src_selected_rows.value(), dst_device, + framework::TensorCopy(src_selected_rows.value(), + dst_device, dst_selected_rows->mutable_value()); if (blocking) { platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); @@ -493,7 +503,8 @@ void BindImperative(py::module *m_ptr) { // Dygraph DataLoader signal handler m.def("_set_process_pids", [](int64_t key, py::object &obj) { PADDLE_ENFORCE_EQ( - py::isinstance(obj) || py::isinstance(obj), true, + py::isinstance(obj) || py::isinstance(obj), + true, platform::errors::InvalidArgument( "The subprocess ids set in DataLoader is illegal." "Expected data type is tuple or list, but received %s", @@ -528,7 +539,8 @@ void BindImperative(py::module *m_ptr) { // 1. cast to python array auto array = batch[i].cast(); PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, + string::Sprintf("%s", array.dtype()).compare("object"), + 0, platform::errors::InvalidArgument( "Faild to convert input data to a regular ndarray.\n * " "Usually this means the input data contains nested " @@ -537,8 +549,8 @@ void BindImperative(py::module *m_ptr) { "_generator' to locate the data causes this issue.")); // 2. construcct LoDTensor framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); + SetTensorFromPyArray( + &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); @@ -548,8 +560,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 5. 
copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t.ResetHolder(shared_writer_holder); // 6. append to result list tensors.append(t); @@ -564,7 +579,8 @@ void BindImperative(py::module *m_ptr) { // 1. cast to python array auto array = obj.cast(); PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, + string::Sprintf("%s", array.dtype()).compare("object"), + 0, platform::errors::InvalidArgument( "Faild to convert input data to a regular ndarray.\n * " "Usually this means the input data contains nested " @@ -573,8 +589,8 @@ void BindImperative(py::module *m_ptr) { "_generator' to locate the data causes this issue.")); // 2. construcct LoDTensor framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); + SetTensorFromPyArray( + &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); @@ -584,8 +600,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 5. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t.ResetHolder(shared_writer_holder); return t; @@ -640,9 +659,12 @@ void BindImperative(py::module *m_ptr) { new (&self) imperative::VarBase(name); }) .def("__init__", - [](imperative::VarBase &self, framework::proto::VarType::Type dtype, - const std::vector &dims, const py::handle &name, - framework::proto::VarType::Type type, bool persistable) { + [](imperative::VarBase &self, + framework::proto::VarType::Type dtype, + const std::vector &dims, + const py::handle &name, + framework::proto::VarType::Type type, + bool persistable) { VLOG(4) << "Init VarBase"; std::string act_name = ""; if (!name.ptr() || name.ptr() == Py_None) { @@ -661,55 +683,107 @@ void BindImperative(py::module *m_ptr) { tensor->Resize(phi::make_ddim(dims)); } }) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", 
&InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) - .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), + .def("__init__", + &InitVarBaseFromTensorWithArgDefault, + py::arg("tensor"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", - [](std::shared_ptr &self, py::handle _index, + [](std::shared_ptr &self, + py::handle _index, py::object 
&value_obj) { VLOG(4) << "Call __setitem_varbase__"; @@ -767,9 +841,16 @@ void BindImperative(py::module *m_ptr) { none_axes, infer_flags, list_select_idxs; // if index is a list, list_select_flag will be true bool list_select_flag = false; - ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, - &steps, &decrease_axes, &none_axes, - &infer_flags, &list_select_idxs, + ParseIndexingSlice(self_tensor, + index_ptr, + &axes, + &starts, + &ends, + &steps, + &decrease_axes, + &none_axes, + &infer_flags, + &list_select_idxs, &list_select_flag); framework::AttributeMap attrs = {{"axes", axes}, @@ -786,7 +867,8 @@ void BindImperative(py::module *m_ptr) { if (tracer->HasGrad()) { PADDLE_ENFORCE_EQ( - self->IsLeaf() && !self->OverridedStopGradient(), false, + self->IsLeaf() && !self->OverridedStopGradient(), + false, platform::errors::InvalidArgument( "Leaf Tensor (%s) that doesn't stop gradient can't use " "inplace strategy.", @@ -844,7 +926,9 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(value_tensor->MutableVar() ->GetMutable(), - value, self->Place(), false); + value, + self->Place(), + false); ins.insert({"ValueTensor", {value_tensor}}); } else { @@ -892,7 +976,10 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs), + tracer->TraceOp("set_value", + ins, + outs, + std::move(attrs), {{"Input", "Out"}}); } } else { @@ -910,8 +997,8 @@ void BindImperative(py::module *m_ptr) { VLOG(4) << "index is not tensor"; self_numpy[_index] = value_obj; } - SetTensorFromPyArray(self_tensor, self_numpy, - self_tensor->place(), false); + SetTensorFromPyArray( + self_tensor, self_numpy, self_tensor->place(), false); } }) .def("_getitem_index_not_tensor", @@ -924,10 +1011,17 @@ void BindImperative(py::module *m_ptr) { bool list_select_flag = false; auto tensor = self->MutableVar()->GetMutable(); - ParseIndexingSlice(tensor, _index.ptr(), &slice_axes, - &slice_starts, &slice_ends, &slice_strides, - &decrease_axis, &none_axes, &infer_flags, - &list_select_idxs, &list_select_flag); + ParseIndexingSlice(tensor, + _index.ptr(), + &slice_axes, + &slice_starts, + &slice_ends, + &slice_strides, + &decrease_axis, + &none_axes, + &infer_flags, + &list_select_idxs, + &list_select_flag); // release gil and do tracing py::gil_scoped_release release; const auto &tracer = imperative::GetCurrentTracer(); @@ -1008,8 +1102,8 @@ void BindImperative(py::module *m_ptr) { ->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get( tracer->ExpectedPlace()); - paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, - idx_tensor); + paddle::framework::TensorFromVector( + list_select_idxs, *dev_ctx, idx_tensor); imperative::NameVarBaseMap ins = {{"X", {self}}, {"Index", {select_index}}}; @@ -1024,7 +1118,8 @@ void BindImperative(py::module *m_ptr) { [](std::shared_ptr &self, const py::args &args) { const auto &tensor = self->Var().Get(); PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, + tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self->Name())); @@ -1043,28 +1138,34 @@ void BindImperative(py::module *m_ptr) { size_t offset = 0; if (args.empty()) { PADDLE_ENFORCE_EQ( - numel, 1, + numel, + 1, platform::errors::InvalidArgument( "only one element tensors can be converted to Python " "scalars when no input coordinates")); } else if (args.size() == 1) { offset = args[0].cast(); 
PADDLE_ENFORCE_LT( - offset, numel, + offset, + numel, platform::errors::InvalidArgument( "index %d is out of bounds for size %d", offset, numel)); } else { - PADDLE_ENFORCE_EQ(args.size(), dims.size(), + PADDLE_ENFORCE_EQ(args.size(), + dims.size(), platform::errors::InvalidArgument( "incorrect number of indices for Tensor")); for (size_t i = 0; i < args.size(); ++i) { size_t index = args[i].cast(); PADDLE_ENFORCE_LT( - index, dims[i], + index, + dims[i], platform::errors::InvalidArgument( "index %d is out fo bounds for axis %d with size %d", - index, i, dims[i])); + index, + i, + dims[i])); offset += index * strides[i]; } } @@ -1072,8 +1173,8 @@ void BindImperative(py::module *m_ptr) { if (framework::TransToProtoVarType(tensor.dtype()) == proto_type) { \ std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ T b = TensorGetElement(tensor, offset); \ - return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ - static_cast(&b)); \ + return py::array( \ + py::dtype(py_dtype_str.c_str()), {}, {}, static_cast(&b)); \ } _ForEachDataType_(TENSOR_TO_PY_SCALAR); @@ -1086,7 +1187,8 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, + var->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self.Name())); @@ -1111,7 +1213,8 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, + tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self.Name())); @@ -1142,7 +1245,8 @@ void BindImperative(py::module *m_ptr) { [](const imperative::VarBase &self) -> std::shared_ptr { PADDLE_ENFORCE_EQ( - self.Var().IsInitialized(), true, + self.Var().IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1165,7 +1269,8 @@ void BindImperative(py::module *m_ptr) { const auto &origin_tensor = self.Var().Get(); PADDLE_ENFORCE_EQ( - origin_tensor.IsInitialized(), true, + origin_tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1181,7 +1286,8 @@ void BindImperative(py::module *m_ptr) { const auto &origin_selected_rows = self.Var().Get(); PADDLE_ENFORCE_EQ( - origin_selected_rows.value().IsInitialized(), true, + origin_selected_rows.value().IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1199,7 +1305,8 @@ void BindImperative(py::module *m_ptr) { << ") share data with " << self.Name(); return detach_var; }, - py::return_value_policy::take_ownership, R"DOC( + py::return_value_policy::take_ownership, + R"DOC( Returns a new Tensor, detached from the current graph. It will share data with origin Tensor and always doesn't have a Tensor copy. @@ -1237,8 +1344,10 @@ void BindImperative(py::module *m_ptr) { # one of the variables needed for gradient computation has been modified by an inplace operation. 
)DOC") - .def("clear_gradient", &imperative::VarBase::ClearGradient, - py::arg("set_to_zero") = true, R"DOC( + .def("clear_gradient", + &imperative::VarBase::ClearGradient, + py::arg("set_to_zero") = true, + R"DOC( Only for Tensor that has gradient, normally we use this for Parameters since other temporary Tensor doesen't has gradient. @@ -1258,14 +1367,16 @@ void BindImperative(py::module *m_ptr) { linear.weight.clear_gradient() print("After clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) )DOC") - .def("_gradient_set_empty", &imperative::VarBase::_GradientSetEmpty, + .def("_gradient_set_empty", + &imperative::VarBase::_GradientSetEmpty, py::arg("set_is_empty") = true) .def("_is_gradient_set_empty", &imperative::VarBase::_IsGradientSetEmpty) .def( "clone", [](std::shared_ptr &self) { const auto &tensor = self->Var().Get(); - PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, + PADDLE_ENFORCE_EQ(tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "%s has not been initialized", self->Name())); auto tracer = imperative::GetCurrentTracer(); @@ -1277,7 +1388,8 @@ void BindImperative(py::module *m_ptr) { tracer->TraceOp("assign", ins, outs, attrs); return new_var; }, - py::return_value_policy::copy, R"DOC( + py::return_value_policy::copy, + R"DOC( Returns a new Tensor, which is clone of origin Tensor, and it remains in the current graph. It will always have a Tensor copy. @@ -1402,7 +1514,8 @@ void BindImperative(py::module *m_ptr) { .def("_register_grad_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register gradient hook on a Tensor that stop " "gradient or without gradient.")); @@ -1412,7 +1525,8 @@ void BindImperative(py::module *m_ptr) { .def("_remove_grad_hook", [](imperative::VarBase &self, int64_t hook_id) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot remove gradient hook on a Tensor that stop " "gradient or without gradient.")); @@ -1421,7 +1535,8 @@ void BindImperative(py::module *m_ptr) { .def("_register_void_function_post_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register void function post hook on a Tensor that " "stop " @@ -1437,11 +1552,13 @@ void BindImperative(py::module *m_ptr) { "_register_backward_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - self.IsLeaf(), true, + self.IsLeaf(), + true, platform::errors::InvalidArgument( "Only can register backward hook for leaf Tensor.")); PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register backward hook on a Tensor that stop " "gradient or without gradient.")); @@ -1534,7 +1651,8 @@ void BindImperative(py::module *m_ptr) { .def( "cuda", [](const std::shared_ptr &self, - py::handle &handle, bool blocking) { + py::handle &handle, + bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot copy this Tensor to GPU in CPU 
version Paddle, " @@ -1576,7 +1694,9 @@ void BindImperative(py::module *m_ptr) { } #endif }, - py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( + py::arg("device_id") = py::none(), + py::arg("blocking") = true, + R"DOC( Returns a copy of this Tensor in GPU memory. If this Tensor is already in GPU memory and device_id is default, @@ -1610,7 +1730,8 @@ void BindImperative(py::module *m_ptr) { [](const std::shared_ptr &self) { #ifndef _WIN32 PADDLE_ENFORCE_EQ( - platform::is_cpu_place(self->Place()), true, + platform::is_cpu_place(self->Place()), + true, platform::errors::InvalidArgument( "Sharing memory only support CPU Tensor currently")); // 1. get LoDTensor @@ -1627,8 +1748,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 4. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t->ResetHolder(shared_writer_holder); return *t; #else @@ -1641,7 +1765,8 @@ void BindImperative(py::module *m_ptr) { .def( "_uva", [](const std::shared_ptr &self, int device_id) { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), + true, platform::errors::InvalidArgument( "Unified virtual addressing only support " "CPU Tensor currently.")); @@ -1649,7 +1774,9 @@ void BindImperative(py::module *m_ptr) { self->MutableVar()->GetMutable(); tensor_uva(self_tensor, device_id); }, - py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( + py::arg("device_id") = 0, + py::return_value_policy::reference, + R"DOC( Returns self tensor with the UVA(unified virtual addressing). 
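The `cuda` binding reformatted above takes a `device_id` (default `None`) and a `blocking` flag (default `True`). A small usage sketch, assuming a CUDA-enabled Paddle build; the tensor contents and device id are illustrative:

```python
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], place=paddle.CPUPlace())
y = x.cuda()                              # copy to the current GPU
z = x.cuda(device_id=0, blocking=False)   # explicit device, asynchronous copy
print(y.place, z.place)
```

Passing `blocking=False` requests an asynchronous copy, in the same spirit as the GpuCopyAsync note attached to the `_copy_to` bindings later in this diff.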
Args: @@ -1669,7 +1796,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CPUPlace &place, bool blocking) { + const platform::CPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to // copy data from the tensor of self to the tensor of new varbase, @@ -1688,7 +1816,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CUDAPinnedPlace &place, bool blocking) { + const platform::CUDAPinnedPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1699,7 +1828,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::XPUPlace &place, bool blocking) { + const platform::XPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1710,7 +1840,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CUDAPlace &place, bool blocking) { + const platform::CUDAPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1721,7 +1852,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::NPUPlace &place, bool blocking) { + const platform::NPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1732,7 +1864,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::IPUPlace &place, bool blocking) { + const platform::IPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1743,7 +1876,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::MLUPlace &place, bool blocking) { + const platform::MLUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1754,7 +1888,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CustomPlace &place, bool blocking) { + const platform::CustomPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1765,7 +1900,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::Place &place, bool blocking) { + const platform::Place &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1774,13 +1910,15 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::copy) .def( - "value", [](imperative::VarBase &self) { return self.MutableVar(); }, + "value", + [](imperative::VarBase &self) { return self.MutableVar(); }, py::return_value_policy::reference) .def("_clear", [](const std::shared_ptr &self) { auto *t = 
self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); t->clear(); @@ -1789,7 +1927,8 @@ void BindImperative(py::module *m_ptr) { [](const std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); return t->offset(); @@ -1800,7 +1939,8 @@ void BindImperative(py::module *m_ptr) { auto *src = self->MutableVar()->GetMutable(); auto *dst_ = dst->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - src->IsInitialized(), true, + src->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); dst_->ShareBufferWith(*src); @@ -1822,7 +1962,8 @@ void BindImperative(py::module *m_ptr) { auto *src = self->MutableVar()->GetMutable(); auto *dst_ = dst->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - src->IsInitialized(), true, + src->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); dst_->ShareBufferWith(*src); @@ -1841,10 +1982,12 @@ void BindImperative(py::module *m_ptr) { }) .def("_slice", [](const std::shared_ptr &self, - int64_t begin_idx, int64_t end_idx) { + int64_t begin_idx, + int64_t end_idx) { auto *t = self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); return t->Slice(begin_idx, end_idx); @@ -1880,12 +2023,13 @@ void BindImperative(py::module *m_ptr) { x = paddle.to_tensor(1, dtype='complex128') x.element_size() # 16 )DOC") - .def_property("name", &imperative::VarBase::Name, - &imperative::VarBase::SetName) + .def_property( + "name", &imperative::VarBase::Name, &imperative::VarBase::SetName) .def_property("stop_gradient", &imperative::VarBase::OverridedStopGradient, &imperative::VarBase::SetOverridedStopGradient) - .def_property("persistable", &imperative::VarBase::Persistable, + .def_property("persistable", + &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) .def_property_readonly( "shape", @@ -1909,7 +2053,8 @@ void BindImperative(py::module *m_ptr) { return std::vector(); } }) - .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, + .def_property_readonly("is_leaf", + &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. 
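The property bindings above (`stop_gradient`, `persistable`, `is_leaf`) and the `clear_gradient` method earlier in this file are easiest to read together. A short sketch of the intended dygraph flow, assuming the `paddle.nn.Linear` layer used in the `clear_gradient` docstring:

```python
import paddle

linear = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

print(linear.weight.is_leaf)    # True: parameters are created by the user
y = linear(x)
print(y.is_leaf)                # False: y is the output of an op that needs grad

y.sum().backward()
print(linear.weight.grad)       # gradient accumulated by backward()
linear.weight.clear_gradient()  # set_to_zero defaults to True, so grad is zeroed
```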
@@ -1939,7 +2084,8 @@ void BindImperative(py::module *m_ptr) { print(y.is_leaf) # False )DOC") .def_property_readonly( - "place", [](imperative::VarBase &self) { return self.Place(); }, + "place", + [](imperative::VarBase &self) { return self.Place(); }, py::return_value_policy::copy) .def_property_readonly("_place_str", [](imperative::VarBase &self) { @@ -1969,11 +2115,14 @@ void BindImperative(py::module *m_ptr) { .def_property("_enable_program_desc_tracing", &imperative::Tracer::IsProgramDescTracingEnabled, &imperative::Tracer::SetEnableProgramDescTracing) - .def_property("_amp_level", &imperative::Tracer::GetAmpLevel, + .def_property("_amp_level", + &imperative::Tracer::GetAmpLevel, &imperative::Tracer::SetAmpLevel) - .def_property("_amp_dtype", &imperative::Tracer::GetAmpDtype, + .def_property("_amp_dtype", + &imperative::Tracer::GetAmpDtype, &imperative::Tracer::SetAmpDtype) - .def_property("_has_grad", &imperative::Tracer::HasGrad, + .def_property("_has_grad", + &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( "_expected_place", @@ -2039,7 +2188,8 @@ void BindImperative(py::module *m_ptr) { .def("_get_program_desc_tracer", &imperative::Tracer::GetProgramDescTracer, py::return_value_policy::reference) - .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName, + .def("_generate_unique_name", + &imperative::Tracer::GenerateUniqueName, py::arg("key") = "dygraph_tmp") .def("_set_amp_op_list", [](imperative::Tracer &self, @@ -2067,8 +2217,10 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) .def("_get_kernel_signature", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. 
auto ins_map = ConvertToNameTensorMap(ins); @@ -2086,118 +2238,167 @@ void BindImperative(py::module *m_ptr) { [](paddle::small_vector &vec) { return std::vector(vec.begin(), vec.end()); }; - auto ret = self.GetExpectedKernelSignature(type, ins_map, - outs_map, attrs); + auto ret = self.GetExpectedKernelSignature( + type, ins_map, outs_map, attrs); auto kernelsig_ins = input_to_vector(ret.input_names); auto kernelsig_attrs = attr_to_vector(ret.attr_names); auto kernelsig_outs = output_to_vector(ret.output_names); - return std::make_tuple(kernelsig_ins, kernelsig_attrs, - kernelsig_outs); + return std::make_tuple( + kernelsig_ins, kernelsig_attrs, kernelsig_outs); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CustomPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CustomPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::XPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::XPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CUDAPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CUDAPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::NPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::NPUPlace &place, bool 
trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::IPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::IPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::MLUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::MLUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }); @@ -2258,11 +2459,20 @@ void BindImperative(py::module *m_ptr) { &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, bool create_graph, bool retain_graph, - bool allow_unused, bool only_inputs) { - imperative::PartialGradEngine engine( - input_targets, output_targets, output_grads, no_grad_vars, place, - create_graph, retain_graph, allow_unused, only_inputs); + const platform::Place &place, + bool create_graph, + bool retain_graph, + bool allow_unused, + bool only_inputs) { + imperative::PartialGradEngine engine(input_targets, + output_targets, + output_grads, + 
no_grad_vars, + place, + create_graph, + retain_graph, + allow_unused, + only_inputs); engine.Execute(); return engine.GetResult(); }, @@ -2272,7 +2482,8 @@ void BindImperative(py::module *m_ptr) { "dygraph_run_backward", [](const std::vector> &tensors, const std::vector> &grad_tensors, - bool retain_graph, const imperative::Tracer &tracer) { + bool retain_graph, + const imperative::Tracer &tracer) { auto *engine = tracer.GetEngine(); engine->Init(tensors, grad_tensors, retain_graph); VLOG(3) << "Start backward"; @@ -2294,11 +2505,16 @@ void BindImperative(py::module *m_ptr) { const std::vector> &, const std::vector &, std::shared_ptr, - const std::vector &, bool>()) - .def("prepare_for_backward", &imperative::Reducer::PrepareForBackward, - py::arg("vars"), py::call_guard()); - - m.def("assign_group_by_size", &imperative::AssignGroupBySize, py::arg("vars"), + const std::vector &, + bool>()) + .def("prepare_for_backward", + &imperative::Reducer::PrepareForBackward, + py::arg("vars"), + py::call_guard()); + + m.def("assign_group_by_size", + &imperative::AssignGroupBySize, + py::arg("vars"), py::arg("is_sparse_gradient"), py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, py::arg("tensor_indices") = std::vector{}, @@ -2306,7 +2522,8 @@ void BindImperative(py::module *m_ptr) { #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - py::class_>( m, "NCCLParallelContext") .def(py::init>( m, "BKCLParallelContext") .def(py::init>( m, "GLOOParallelContext") .def(py::init>( m, "HCCLParallelContext") .def(py::init>( m, "CNCLParallelContext") .def(py::init>( m, "HeterParallelContext") .def(py::init()) @@ -2376,42 +2598,56 @@ void BindImperative(py::module *m_ptr) { #endif m.def("pylayer_apply", - [](const platform::CPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CUDAPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CUDAPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::XPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::XPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CUDAPinnedPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CUDAPinnedPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::NPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::NPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::MLUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::MLUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return 
imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CustomPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CustomPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); @@ -2437,8 +2673,8 @@ void BindImperative(py::module *m_ptr) { SetUVATensorFromPyArray(new_tensor, array, device_id); } else if (py::isinstance>( array)) { - SetUVATensorFromPyArray(new_tensor, array, - device_id); + SetUVATensorFromPyArray( + new_tensor, array, device_id); } else if (py::isinstance>(array)) { SetUVATensorFromPyArray(new_tensor, array, device_id); } else { @@ -2452,8 +2688,10 @@ void BindImperative(py::module *m_ptr) { } return new_tensor; }, - py::arg("obj"), py::arg("device_id") = 0, - py::return_value_policy::reference, R"DOC( + py::arg("obj"), + py::arg("device_id") = 0, + py::return_value_policy::reference, + R"DOC( Returns tensor with the UVA(unified virtual addressing) created from numpy array. Args: @@ -2485,26 +2723,32 @@ void BindImperative(py::module *m_ptr) { #if defined(PADDLE_WITH_CUDA) m.def( "async_write", - [](const imperative::VarBase &src, imperative::VarBase &dst, - const imperative::VarBase &offset, const imperative::VarBase &count) { + [](const imperative::VarBase &src, + imperative::VarBase &dst, + const imperative::VarBase &offset, + const imperative::VarBase &count) { PADDLE_ENFORCE_EQ( - platform::is_gpu_place(src.Place()), true, + platform::is_gpu_place(src.Place()), + true, platform::errors::InvalidArgument( "Required `src` device should be CUDAPlace, but received %d. ", src.Place())); PADDLE_ENFORCE_EQ( - platform::is_cuda_pinned_place(dst.Place()), true, + platform::is_cuda_pinned_place(dst.Place()), + true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPinnedPlace, " "but received %d. ", dst.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(offset.Place()), true, + platform::is_cpu_place(offset.Place()), + true, platform::errors::InvalidArgument("Required `offset` device should " "be CPUPlace, but received %d. ", offset.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(count.Place()), true, + platform::is_cpu_place(count.Place()), + true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d. 
", count.Place())); @@ -2517,23 +2761,28 @@ void BindImperative(py::module *m_ptr) { auto &count_tensor = count.Var().Get(); const auto &deviceId = paddle::platform::GetCurrentDeviceId(); - PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`offset` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`count` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + PADDLE_ENFORCE_EQ(offset_tensor.numel(), + count_tensor.numel(), platform::errors::InvalidArgument( "`offset` and `count` tensor size dismatch.")); PADDLE_ENFORCE_EQ( - src_tensor.dims().size(), dst_tensor->dims().size(), + src_tensor.dims().size(), + dst_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); for (int i = 1; i < src_tensor.dims().size(); i++) { PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], dst_tensor->dims()[i], + src_tensor.dims()[i], + dst_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); @@ -2550,15 +2799,19 @@ void BindImperative(py::module *m_ptr) { int64_t src_offset = 0, dst_offset, c; for (int64_t i = 0; i < offset_tensor.numel(); i++) { dst_offset = offset_data[i], c = count_data[i]; - PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + PADDLE_ENFORCE_LE(src_offset + c, + src_tensor.dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index")); - PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(dst_offset + c, + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index")); - cudaMemcpyAsync( - dst_data + (dst_offset * size), src_data + (src_offset * size), - c * size * sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(dst_data + (dst_offset * size), + src_data + (src_offset * size), + c * size * sizeof(float), + cudaMemcpyDeviceToHost, + stream); src_offset += c; } }, @@ -2615,37 +2868,46 @@ void BindImperative(py::module *m_ptr) { m.def( "async_read", - [](const imperative::VarBase &src, imperative::VarBase &dst, - const imperative::VarBase &index, imperative::VarBase &buffer, - const imperative::VarBase &offset, const imperative::VarBase &count) { - PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), true, + [](const imperative::VarBase &src, + imperative::VarBase &dst, + const imperative::VarBase &index, + imperative::VarBase &buffer, + const imperative::VarBase &offset, + const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), + true, platform::errors::InvalidArgument( "Required `src` device should be " "CUDAPinnedPlace, but received %d.", src.Place())); PADDLE_ENFORCE_EQ( - platform::is_gpu_place(dst.Place()), true, + platform::is_gpu_place(dst.Place()), + true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPlace, but received %d.", dst.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(index.Place()), true, + platform::is_cpu_place(index.Place()), + true, platform::errors::InvalidArgument( "Required `index` device should be CPUPlace, but received %d.", index.Place())); PADDLE_ENFORCE_EQ( - platform::is_cuda_pinned_place(buffer.Place()), true, + 
platform::is_cuda_pinned_place(buffer.Place()), + true, platform::errors::InvalidArgument( "Required `buffer` device should be CUDAPinnedPlace, " "but received %d.", buffer.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(offset.Place()), true, + platform::is_cpu_place(offset.Place()), + true, platform::errors::InvalidArgument( "Required `offset` device should be CPUPlace, but received %d.", offset.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(count.Place()), true, + platform::is_cpu_place(count.Place()), + true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d.", count.Place())); @@ -2660,28 +2922,33 @@ void BindImperative(py::module *m_ptr) { auto *dst_data = dst_tensor->mutable_data(dst.Place()); const auto &deviceId = paddle::platform::GetCurrentDeviceId(); - PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(), + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), + dst_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `dst` should have same tensor shape, " "except for the first dimension.")); PADDLE_ENFORCE_EQ( - src_tensor.dims().size(), buffer_tensor->dims().size(), + src_tensor.dims().size(), + buffer_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `buffer` should have same tensor shape, " "except for the first dimension.")); for (int i = 1; i < src_tensor.dims().size(); i++) { PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], dst_tensor->dims()[i], + src_tensor.dims()[i], + dst_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], buffer_tensor->dims()[i], + src_tensor.dims()[i], + buffer_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `buffer` should have the same tensor shape, " "except for the first dimension.")); } - PADDLE_ENFORCE_EQ(index_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(index_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`index` tensor should be one-dimensional.")); @@ -2693,13 +2960,16 @@ void BindImperative(py::module *m_ptr) { int64_t size = src_tensor.numel() / src_tensor.dims()[0]; if (copy_flag != 0) { - PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`offset` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`count` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + PADDLE_ENFORCE_EQ(offset_tensor.numel(), + count_tensor.numel(), platform::errors::InvalidArgument( "`offset` and `count` tensor size dismatch.")); auto *offset_data = offset_tensor.data(); @@ -2711,7 +2981,8 @@ void BindImperative(py::module *m_ptr) { buffer_tensor->dims()[0], platform::errors::InvalidArgument( "Buffer tensor size is too small.")); - PADDLE_ENFORCE_LE(numel + index_tensor.numel(), dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Target tensor size is too small.")); @@ -2719,19 +2990,24 @@ void BindImperative(py::module *m_ptr) { auto *src_data = src_tensor.data(); for (int64_t i = 0; i < offset_tensor.numel(); i++) { src_offset = offset_data[i], c = count_data[i]; - PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + PADDLE_ENFORCE_LE(src_offset + c, 
+ src_tensor.dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index.")); - PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(dst_offset + c, + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index.")); - cudaMemcpyAsync( - dst_data + (dst_offset * size), src_data + (src_offset * size), - c * size * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(dst_data + (dst_offset * size), + src_data + (src_offset * size), + c * size * sizeof(float), + cudaMemcpyHostToDevice, + stream); dst_offset += c; } } else { - PADDLE_ENFORCE_LE(index_tensor.numel(), buffer_tensor->dims()[0], + PADDLE_ENFORCE_LE(index_tensor.numel(), + buffer_tensor->dims()[0], platform::errors::InvalidArgument( "Buffer tensor size is too small.")); } @@ -2749,16 +3025,19 @@ void BindImperative(py::module *m_ptr) { int64_t c = 0; for (int64_t i = 0; i < index_tensor.numel(); i++) { std::memcpy(buffer_data + c * slice_size, - src_data + index_data[i] * slice_size, copy_bytes); + src_data + index_data[i] * slice_size, + copy_bytes); c += 1; } }; index_select(src_tensor, index_tensor, buffer_tensor); // Copy the data to device memory - cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), + cudaMemcpyAsync(dst_data + (numel * size), + buffer_tensor->data(), index_tensor.numel() * size * sizeof(float), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); }, R"DOC( This api provides a way to read from pieces of source tensor to destination tensor