diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index bddd2023b437b..13676ec910b05 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -21,6 +21,11 @@ else()
   set(ASCEND_DIR /usr/local/Ascend)
 endif()
 
+if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
+  # It means CANN 20.2 +
+  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+endif()
+
 if(WITH_ASCEND)
   set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
   set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -43,9 +48,7 @@ if(WITH_ASCEND)
   set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
   INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
 
-  if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
-    add_definitions(-DPADDLE_WITH_ASCEND_STRING)
-  endif()
+
   ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
 
@@ -64,11 +67,13 @@ if(WITH_ASCEND_CL)
   set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
   set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
 
-  message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
+  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
   message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
+  INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
+  INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
 
   ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 101991d2c1ba0..e5bfbf4a8f779 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -456,11 +456,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #endif
   } else if (platform::is_npu_place(place_)) {
 #ifdef PADDLE_WITH_ASCEND_CL
-    // TODO(ascendrc): Support garbage collector on NPUPlace
-    VLOG(4) << "Skip NPU gc because it is not implemented now.";
+    if (IsFastEagerDeletionModeEnabled()) {
+      VLOG(4) << "Use unsafe fast gc for NPU.";
+      gc.reset(new NPUUnsafeFastGarbageCollector(
+          BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
+    } else {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Please set FLAGS_fast_eager_deletion_mode=true to use "
+          "GarbageCollector on NPU."));
+      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
+      VLOG(4) << "Use default stream gc for NPU.";
+      gc.reset(new NPUDefaultStreamGarbageCollector(
+          BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
+    }
 #else
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "No NPU gc found in CPU/GPU/XPU paddle"));
+    PADDLE_THROW(
+        platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
 #endif
   }
 }
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index 8dfbd3c268b86..9ab6b5d8c178b 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback(
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
+    const platform::NPUPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void NPUDefaultStreamGarbageCollector::Wait() const {
+  static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
+      ->WaitStreamCallback();
+}
+
+void NPUDefaultStreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
+      ->AddStreamCallback(callback);
+}
+NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
+    const platform::NPUPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void NPUUnsafeFastGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback();
+}
+
+#endif
+
 int64_t GetEagerDeletionThreshold() {
   return FLAGS_eager_delete_tensor_gb < 0
              ? -1
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 572c79d21a045..2c2b57bbe420a 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
 };
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+class NPUDefaultStreamGarbageCollector : public GarbageCollector {
+ public:
+  NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
+                                   size_t max_memory_size);
+
+  void Wait() const override;
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+
+class NPUUnsafeFastGarbageCollector : public GarbageCollector {
+ public:
+  NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
+                                size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
+
 template <typename Container>
 void GarbageCollector::Add(Container &&objs) {
   Add(std::forward<Container>(objs), []() {});
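The two collectors added above differ only in *when* the deallocation callback may run: the default-stream flavor defers it behind all work already queued on the device stream, while the unsafe fast flavor runs it immediately, which is only safe when all NPU work is issued on a single stream. A minimal standalone sketch of that contract (FakeStream and the function names are illustrative stand-ins, not ACL or Paddle APIs):

```cpp
#include <functional>
#include <iostream>
#include <queue>

// Stand-in for a device stream that runs host callbacks in submission order.
struct FakeStream {
  std::queue<std::function<void()>> pending;
  void AddCallback(std::function<void()> cb) { pending.push(std::move(cb)); }
  void Synchronize() {  // like NPUDeviceContext::WaitStreamCallback()
    while (!pending.empty()) {
      pending.front()();
      pending.pop();
    }
  }
};

// Default-stream flavor: free only after prior device work has finished.
void DefaultStreamClear(FakeStream* s, std::function<void()> free_fn) {
  s->AddCallback(std::move(free_fn));
}

// Unsafe fast flavor: free immediately; safe only under a single stream,
// so a later reuse of the buffer cannot race with in-flight kernels.
void UnsafeFastClear(std::function<void()> free_fn) { free_fn(); }

int main() {
  FakeStream stream;
  DefaultStreamClear(&stream, [] { std::cout << "deferred free\n"; });
  UnsafeFastClear([] { std::cout << "immediate free\n"; });
  stream.Synchronize();  // the deferred free runs only here
}
```

This is also why the executor change above insists on FLAGS_fast_eager_deletion_mode for now: the deferred (default-stream) variant is present but still disabled pending bug fixes.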
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 4c52932976122..818da7478b239 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx
       op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE,       \
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
 
+#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -840,6 +840,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
 #endif
+    } else if (platform::is_npu_place(tensor.place())) {
+#ifdef PADDLE_WITH_ASCEND_CL
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& npu_dev_ctx =
+          static_cast<const platform::NPUDeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     BOOST_GET_CONST(platform::NPUPlace, tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     npu_dev_ctx.stream());
+        npu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "NPUPlace is not supported when not compiled with NPU"));
+#endif
     } else {
       os.write(static_cast<const char*>(data_ptr),
                static_cast<std::streamsize>(size));
@@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
-        platform::is_xpu_place(dev_ctx.GetPlace())) {
+        platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_npu_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_XPU)
+    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
+      if (platform::is_npu_place(dev_ctx.GetPlace())) {
+        dev_ctx.Wait();
+      }
 #else
       if (platform::is_gpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "CUDAPlace is not supported when not compiled with CUDA"));
-      } else {
+      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "XPUPlace is not supported when not compiled with XPU"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "NPUPlace is not supported when not compiled with NPU"));
       }
 #endif
     } else {
@@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
-        platform::is_xpu_place(dev_ctx.GetPlace())) {
+        platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_npu_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_XPU)
+    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
+      if (platform::is_npu_place(dev_ctx.GetPlace())) {
+        dev_ctx.Wait();
+      }
 #else
       if (platform::is_gpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "CUDAPlace is not supported when not compiled with CUDA"));
-      } else {
+      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "XPUPlace is not supported when not compiled with XPU"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "NPUPlace is not supported when not compiled with NPU"));
       }
 #endif
     } else {
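TensorToStream cannot hand NPU memory to `os.write` directly, so the new branch stages the tensor through a fixed 64MB host buffer one chunk at a time, waiting after each async copy. A standalone sketch of the same loop, with plain `memcpy` standing in for the NPU-to-CPU `memory::Copy` + `Wait()` pair (function and buffer names are illustrative):

```cpp
#include <algorithm>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>

void WriteDeviceBuffer(std::ostream& os, const char* device_data,
                       size_t size) {
  constexpr size_t kBufSize = 64;  // 64MB in the real code; tiny for the demo
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    // Real code: memory::Copy(cpu, buf.get(), npu_place, data, n, stream);
    //            npu_dev_ctx.Wait();  // the copy is async; it must land first
    std::memcpy(buf.get(), device_data, size_to_write);
    os.write(buf.get(), size_to_write);
    device_data += size_to_write;
    size -= size_to_write;
  }
}

int main() {
  std::string payload(150, 'x');
  std::ostringstream os;
  WriteDeviceBuffer(os, payload.data(), payload.size());
  std::cout << os.str().size() << " bytes written\n";  // 150
}
```

The fixed-size staging buffer bounds host memory use no matter how large the tensor is; the per-chunk `Wait()` is required because aclrtMemcpyAsync is genuinely asynchronous.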
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 85af9e5008702..22c8e1c1665f1 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -159,11 +159,15 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
+  // cudaMemcpyAsync.
+  // cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
+  // aclrtMemcpyAsync is really "async" between cpu <-> npu.
+  // Since the vector is on cpu, this function should be a "sync" operation,
+  // so pass nullptr as stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
 }
@@ -202,10 +206,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
   delete[] array;
@@ -265,10 +267,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
 }
@@ -301,10 +302,9 @@ inline void TensorToVector(const Tensor& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
   for (unsigned int i = 0; i < src.numel(); i++) {
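The nullptr stream in these calls is what makes the copy synchronous. A standalone sketch of the contract the NOTE above describes, using a stand-in `Copy` with the same nullptr-means-sync convention as `memory::Copy` (everything here is illustrative):

```cpp
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for memory::Copy: a null stream means "synchronous".
void Copy(void* dst, const void* src, size_t n, void* stream) {
  if (stream == nullptr) {
    std::memcpy(dst, src, n);  // returns only once the data has landed
  } else {
    // An async path would merely enqueue the copy; src must then outlive it.
  }
}

int main() {
  char device_buf[16] = {};
  {
    std::vector<char> host{'n', 'p', 'u', '\0'};
    Copy(device_buf, host.data(), host.size(), /*stream=*/nullptr);
  }  // host vector dies here; safe only because the copy was synchronous
  std::cout << device_buf << "\n";
}
```

With a truly async aclrtMemcpyAsync, passing the device stream here could let the NPU read the vector's storage after the caller has already destroyed it, which is exactly the hazard the nullptr avoids.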
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 1eb0535831bb1..730d49e8acd93 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -207,12 +207,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(
 
   platform::SetNPUDeviceId(dst_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is actually run, which means Paddle doesn't fully support async yet.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by stream(" << stream << ")";
 
@@ -220,6 +214,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(
     platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
   } else {
+    // On NPU, an async operation after a sync operation is OK, while a sync
+    // operation after an async one is not, since the async operation may not
+    // have finished. So we need to wait before the sync operation.
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
   }
@@ -235,12 +235,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(
 
   platform::SetNPUDeviceId(src_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is actually run, which means Paddle doesn't fully support async yet.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by stream(" << stream << ")";
 
@@ -248,6 +242,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(
     platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
   } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
@@ -270,6 +267,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(
       platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                                stream);
     } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
      platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
       platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
     }
@@ -284,6 +285,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(
       platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                                stream);
     } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
       platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
       platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
     }
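The waits inserted before each `NPUMemcpySync` enforce the ordering rule from the comment: a blocking copy must not run until previously enqueued async work on the device has drained. A standalone sketch of the hazard (FakeStream is illustrative; on a real NPU the risk comes from aclrtMemcpyAsync being truly asynchronous):

```cpp
#include <functional>
#include <iostream>
#include <vector>

struct FakeStream {
  std::vector<std::function<void()>> queued;
  void Enqueue(std::function<void()> op) { queued.push_back(std::move(op)); }
  void Wait() {  // like NPUDeviceContext::Wait() before NPUMemcpySync
    for (auto& op : queued) op();
    queued.clear();
  }
};

int main() {
  int x = 0;
  FakeStream stream;
  stream.Enqueue([&] { x = 42; });  // async producer, not yet executed
  stream.Wait();                    // without this, a sync read could see 0
  std::cout << x << "\n";           // a blocking read of x is now safe
}
```

Note the asymmetry: async-after-sync needs no wait (the stream serializes behind the completed sync op), which is why only the sync branches gained a `Wait()`.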
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index cecc70cc6dda8..3aa47c9e092f7 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -167,7 +167,7 @@ endif()
 
 if (WITH_ASCEND_CL)
     cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
-    cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op)
+    cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
 endif()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
index 923b581af287d..f368c65823055 100644
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ b/paddle/fluid/operators/activation_op_npu.cc
@@ -77,8 +77,7 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
     // 2.1 Get a factor tensor with shape [1].
     Tensor factor_tensor(framework::proto::VarType::FP32);
     factor_tensor.mutable_data<float>({1}, place);
-    TensorFromVector(std::vector<float>{factor}, ctx.device_context(),
-                     &factor_tensor);
+    FillNpuTensorWithConstant<float>(&factor_tensor, factor);
 
     // 2.2 Get the factor which has the shape with x and the same value with
     // factor.
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
index 46f9f7ff08944..21968dcb05dd1 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -44,10 +44,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     // step1: inverse scale(RealDiv)
     Tensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{static_cast<T>(1.0)}, ctx.device_context(),
-                     &const_tensor);
-
-    ctx.template device_context<platform::NPUDeviceContext>().Wait();
+    FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
 
     // Inverse(1.0/scale)
     Tensor* tmp_inverse_out = const_cast<Tensor*>(scale);
@@ -61,7 +58,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
 
     size_t x_size = xs.size();
     for (size_t i = 0; i < x_size; ++i) {
-      found_inf_data = true;
       const auto* x = xs[i];
       auto* out = outs[i];
       out->mutable_data<T>(ctx.GetPlace());
@@ -77,6 +73,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
             NpuOpRunner("CheckNumerics", {*x}, {check_xout},
                         {{"message", std::string("check_nan_and_inf")}});
         runner_checknumerics.Run(stream);
+        ctx.template device_context<platform::NPUDeviceContext>()
+            .Wait();
       } catch (platform::EnforceNotMet& exception) {
         LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
         found_inf_data = true;
@@ -104,7 +102,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
       bool* is_found_inf =
          found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
       *is_found_inf = true;
-      framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf);
+
+      framework::TensorCopy(
+          found_inf_tensor, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), found_inf);
+      ctx.template device_context<platform::NPUDeviceContext>().Wait();
     }
   }
 };
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
index 99e81a4757d0e..a80b83f0cbe51 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
@@ -110,22 +110,22 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
   // out found_inf
   Tensor found_inf_tensor;
   found_inf_tensor.Resize({1});
-  bool *is_finite_data =
+  bool *found_inf_data =
       found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
   f::TensorCopy(*found_inf, place, &found_inf_tensor);
-  EXPECT_FALSE(*is_finite_data);
+  EXPECT_TRUE(*found_inf_data);
 
   ctx.Wait();
 }
 
 TEST(check_finite_and_unscale, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx);
 }
 
 TEST(check_finite_and_unscale, NPU_fp16) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<p::float16>(&scope, ctx);
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<p::float16>(&scope, *ctx);
 }
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
index dd6dbfd5c0b65..45b28bf61e5d6 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -41,7 +41,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     // bad_out_data = bad_in_data + 1
     Tensor factor_tensor(bad_out_tensor->type());
     factor_tensor.mutable_data<int>({1}, place);
-    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
+    FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
                                  {*bad_out_tensor}, {});
     runner_p2.Run(stream);
@@ -84,7 +84,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     // good_out_data = good_in_data + 1
     Tensor factor_tensor(good_out_tensor->type());
     factor_tensor.mutable_data<int>({1}, place);
-    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
+    FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
                                  {*good_out_tensor}, {});
     runner_p2.Run(stream);
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
index 5cf1303a229a9..792d01a5efe43 100644
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ b/paddle/fluid/operators/assign_op_npu_test.cc
@@ -75,6 +75,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
 
 TEST(assign, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx, "assign");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "assign");
 }
diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc
index 20b33c4e4e05a..8d34e0ba99c2c 100644
--- a/paddle/fluid/operators/cast_op_npu.cc
+++ b/paddle/fluid/operators/cast_op_npu.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
@@ -41,11 +40,20 @@ class CastNPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<Tensor>("X");
     int dtype = ctx.Attr<int>("out_dtype");
-
     auto* out = ctx.Output<Tensor>("Out");
-
     auto place = ctx.GetPlace();
 
+    if (x->type() == dtype) {
+      // NOTE(zhiqiu): NPU cast op may result in wrong value, so
+      // add a special case here.
+      VLOG(4) << "cast to same dtype:" << dtype;
+      out->mutable_data(place, x->type());
+      framework::TensorCopy(
+          *x, ctx.GetPlace(),
+          ctx.template device_context<platform::NPUDeviceContext>(), out);
+      return;
+    }
+
     auto iter = DTYPE_2_ACL_DTYPE.find(
         static_cast<framework::proto::VarType::Type>(dtype));
     int aclDtype = iter->second;
@@ -76,7 +84,7 @@ class CastNPUKernel : public framework::OpKernel<T> {
   }
 };
 }  // namespace operators
-}  // namespace paddleaclDtype
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 
@@ -89,4 +97,3 @@ REGISTER_OP_NPU_KERNEL(
     ops::CastNPUKernel<paddle::platform::NPUDeviceContext, double>,
     ops::CastNPUKernel<paddle::platform::NPUDeviceContext, bool>);
-#endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
index 5b8d08a8943dd..3768748931ded 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -100,9 +100,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                         {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dx);
       }
     }
 
@@ -127,8 +127,6 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                       {{"axes", axes}, {"keep_dims", false}});
         runner.Run(stream);
         tmp_dout = &reduced_dout;
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
       }
 
       // stage 2
@@ -144,9 +142,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                         {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dy);
       }
     }
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index df6fae6c8484a..f06dbd26873a6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -38,7 +38,7 @@ USE_OP(elementwise_sub);
 USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
 
 template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx,
+void Compare(f::Scope *scope, const p::DeviceContext &ctx,
              std::string op_type) {
   // init
   auto x = scope->Var("X");
@@ -62,8 +62,6 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx,
   TensorFromVector(init_y, ctx, tensor_y);
   tensor_y->Resize({10, 10});
 
-  ctx.Wait();
-
   auto place = ctx.GetPlace();
   auto out = scope->Var("Out");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
@@ -74,7 +72,6 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx,
                                    {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -93,7 +90,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx,
 }
 
 template <typename T>
-void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
+void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
                  std::string op_type) {
   // init
   auto dout = scope->Var("DOut");
@@ -122,8 +119,6 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
   TensorFromVector(init_dout, ctx, tensor_dout);
   tensor_dout->Resize({2, 3, 5});
 
-  ctx.Wait();
-
   // run
   f::AttributeMap attrs;
   auto op = f::OpRegistry::CreateOp(
@@ -132,7 +127,6 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
 
   auto place = ctx.GetPlace();
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> dx_vec;
   TensorToVector(*tensor_dx, ctx, &dx_vec);
@@ -160,30 +154,30 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
 
 TEST(elementwise_add, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx, "elementwise_add");
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "elementwise_add");
 }
 
 TEST(elementwise_sub, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx, "elementwise_sub");
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "elementwise_sub");
 }
 
 TEST(elementwise_sub, NPU_fp16) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<p::float16>(&scope, ctx, "elementwise_sub");
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<p::float16>(&scope, *ctx, "elementwise_sub");
 }
 
 TEST(elementwise_sub_grad, NPU) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
 }
 
 TEST(elementwise_add_grad, NPU) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
 }
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
index 809445c286203..a6e438f8016e0 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
@@ -102,7 +102,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
                         {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dx);
       }
     }
     if (dy) {
diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc
index f4ae1785b024f..453a990efbded 100644
--- a/paddle/fluid/operators/expand_op_npu.cc
+++ b/paddle/fluid/operators/expand_op_npu.cc
@@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
             expand_times.size(), static_cast<size_t>(in_dims.size())));
     auto* out0 = context.Output<framework::Tensor>("Out");
     framework::DDim out_dims(in_dims);
+
     for (size_t i = 0; i < expand_times.size(); ++i) {
       out_dims[i] *= expand_times[i];
     }
+
     out0->Resize(out_dims);
     out0->mutable_data<T>(context.device_context().GetPlace());
     auto runner =
diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc
index 95f7865a8a3a4..880eb341f2093 100644
--- a/paddle/fluid/operators/expand_op_npu_test.cc
+++ b/paddle/fluid/operators/expand_op_npu_test.cc
@@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
 
 TEST(expand, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx);
 }
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index 9d5499e00c82f..4ea4c11c47835 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
 
     Tensor tensor_tmp(data_type);
     tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{value}, ctx.device_context(), &tensor_tmp);
+    FillNpuTensorWithConstant<T>(&tensor_tmp, value);
 
     out_var->mutable_data<T>(shape, place);
     auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index 8a487234ad94a..1ee8889995f4d 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -50,6 +50,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
     auto *x = ctx.Input<Tensor>("X");
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
 
     // step1: Unsqueeze index
     framework::Tensor tmp_tensor(index->type());
@@ -66,7 +67,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
             .stream();
 
     // step2: ZerosLike x in device
-    Tensor zeroslike_xout(x->type());
+    Tensor zeroslike_xout(dx->type());
     zeroslike_xout.Resize(x->dims());
     auto p = zeroslike_xout.mutable_data<T>(ctx.GetPlace());
 
@@ -74,7 +75,6 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
                              zeroslike_xout.numel() * sizeof(T), stream);
 
     // step3: scatter(x_grad)
-    dx->mutable_data<T>(ctx.GetPlace());
     auto runner_scatter = NpuOpRunner(
         "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {});
     runner_scatter.Run(stream);
diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
index de067e45585d9..31e19d8f600c3 100644
--- a/paddle/fluid/operators/gather_op_npu_test.cc
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
 TEST(gather, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx, "gather");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "gather");
 }
 
 TEST(gather, NPU_fp16) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<p::float16>(&scope, ctx, "gather");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<p::float16>(&scope, *ctx, "gather");
 }
 
 TEST(gather_grad, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  CompareGrad<float>(&scope, ctx, "gather_grad");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  CompareGrad<float>(&scope, *ctx, "gather_grad");
 }
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index f11812ce3bb21..830dcd5983901 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -157,12 +157,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
 
 TEST(gelu, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx);
 }
 
 TEST(gelu_grad, NPU) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  CompareGrad<float>(&scope, ctx);
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  CompareGrad<float>(&scope, *ctx);
 }
diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc
index c1859bce02c90..7d75e385e8f3b 100644
--- a/paddle/fluid/operators/increment_op_npu.cc
+++ b/paddle/fluid/operators/increment_op_npu.cc
@@ -39,10 +39,9 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
     out_tensor->mutable_data<T>(context.GetPlace());
 
     Tensor
 step_tensor(x_tensor->type());
-    std::vector<T> step_vec;
-    step_vec.push_back(static_cast<T>(step));
-    framework::TensorFromVector(step_vec, context.device_context(),
-                                &step_tensor);
+
+    step_tensor.mutable_data<T>({1}, context.GetPlace());
+    FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));
 
     auto runner =
         NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});
diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc
index b466ae275dd1c..bde349b0a33b9 100644
--- a/paddle/fluid/operators/increment_op_npu_test.cc
+++ b/paddle/fluid/operators/increment_op_npu_test.cc
@@ -71,12 +71,12 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
 
 TEST(increment, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx, "increment");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "increment");
 }
 
 TEST(increment, NPU_fp64) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<double>(&scope, ctx, "increment");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<double>(&scope, *ctx, "increment");
 }
diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc
index 95549319cd209..c0c228ef22af3 100644
--- a/paddle/fluid/operators/layer_norm_op_npu.cc
+++ b/paddle/fluid/operators/layer_norm_op_npu.cc
@@ -80,8 +80,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
       default_scale.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
-                       ctx.device_context(), &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
       runner.Run(stream);
@@ -95,8 +94,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
       default_bias.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(0)}, ctx.device_context(),
-                       &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}});
       runner.Run(stream);
@@ -251,8 +249,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
       default_scale.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
-                       ctx.device_context(), &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
       runner.Run(stream);
diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc
new file mode 100644
index 0000000000000..4b9b96c23b0b7
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op_npu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc
new file mode 100644
index 0000000000000..1f53280345831
--- /dev/null
+++ b/paddle/fluid/operators/load_op_npu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index fab2d7f7aa054..f614d906baa75 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -28,6 +28,12 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
     auto *ids_t = ctx.Input<framework::LoDTensor>("Ids");      // int tensor
     auto *output_t = ctx.Output<framework::LoDTensor>("Out");  // float tensor
     auto *table_t = ctx.Input<framework::LoDTensor>("W");
+
+    // It seems cann 20.1 accepts int64, but cann 20.2+ does not.
+    PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32,
+                      platform::errors::Unimplemented(
+                          "The index of LookupTableV2 should be int32."));
+
     auto *table_var = ctx.InputVar("W");
     PADDLE_ENFORCE_EQ(
         table_var->IsType<framework::LoDTensor>(), true,
@@ -49,28 +55,26 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *ids_t = ctx.Input<framework::LoDTensor>("Ids");
+
     auto *output_grad_t =
         ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
     auto *table_grad_t =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
-    table_grad_t->mutable_data<T>(ctx.GetPlace());
+    auto *p = table_grad_t->mutable_data<T>(ctx.GetPlace());
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    // step2: ZerosLike x in device
-    Tensor zeroslike_w(table_grad_t->type());
-    zeroslike_w.Resize(table_grad_t->dims());
-    auto p = zeroslike_w.mutable_data<T>(ctx.GetPlace());
-
     platform::NPUMemsetAsync(static_cast<void *>(p), 0,
-                             zeroslike_w.numel() * sizeof(T), stream);
+                             table_grad_t->numel() * sizeof(T), stream);
 
-    table_grad_t->mutable_data<T>(ctx.GetPlace());
+    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+    // can be different tensors, but in cann 20.2+, it does an inplace
+    // operation. Thus, the first input and output should be the same tensor.
     auto runner_scatter =
-        NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t},
-                    {*table_grad_t}, {});
+        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                    {*table_grad_t}, {{"use_locking", true}});
     runner_scatter.Run(stream);
   }
 };
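Why the kernel now zero-fills W@GRAD and passes it as both the first input and the output: with an in-place ScatterAdd, the destination doubles as the accumulator, so it must start at zero (hence the NPUMemsetAsync above). A standalone sketch of that contract, as a plain C++ loop rather than the CANN operator:

```cpp
#include <iostream>
#include <vector>

// In-place scatter-add: table is both the "ref" input and the output.
void ScatterAddInplace(std::vector<float>* table, const std::vector<int>& ids,
                       const std::vector<float>& grads) {
  for (size_t i = 0; i < ids.size(); ++i) (*table)[ids[i]] += grads[i];
}

int main() {
  std::vector<float> table(4, 0.0f);  // zero-filled W@GRAD
  ScatterAddInplace(&table, {1, 1, 3}, {0.5f, 0.5f, 2.0f});
  std::cout << table[1] << " " << table[3] << "\n";  // 1 2
}
```

The old code scattered into a separate zeros tensor and relied on the output being distinct, which CANN 20.2+ no longer honors; `use_locking` serializes the duplicate-index accumulation (ids 1 and 1 above).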
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc
deleted file mode 100644
index f37915834bd75..0000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <cmath>
-#include <iostream>
-#include <numeric>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace m = paddle::operators::math;
-
-USE_OP(lookup_table_v2);
-USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU);
-
-template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto ids = scope->Var("Ids");
-  auto out = scope->Var("Out");
-  auto w = scope->Var("W");
-
-  auto ids_t = ids->GetMutable<f::LoDTensor>();
-  auto out_t = out->GetMutable<f::LoDTensor>();
-  auto w_t = w->GetMutable<f::LoDTensor>();
-  int bsz = 10;
-  int dim = 32;
-  int seqlen = 8;
-  int vocab_size = 100;
-  TensorFromVector(std::vector<int64_t>(bsz * seqlen, 3), ctx, ids_t);
-  std::vector<float> val(vocab_size * dim, 10.);
-  TensorFromVector(val, ctx, w_t);
-  ids_t->Resize({bsz, seqlen});
-  w_t->Resize({vocab_size, dim});
-  out_t->Resize({bsz, seqlen, dim});
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  out_t->mutable_data<T>(place);
-  f::AttributeMap attrs = {{}};
-  auto op = f::OpRegistry::CreateOp("lookup_table_v2",
-                                    {{"W", {"W"}}, {"Ids", {"Ids"}}},
-                                    {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
-  std::vector<T> out_v;
-  TensorToVector(*out_t, ctx, &out_v);
-  ctx.Wait();
-  EXPECT_EQ(out_t->numel(), bsz * seqlen * dim);
-  T res = std::accumulate(out_v.begin(), out_v.end(), 0.);
-  float eps = 1.e-6;
-  EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps);
-}
-
-template <typename T>
-void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto w = scope->Var("W");
-  auto ids = scope->Var("Ids");
-  auto out = scope->Var("DOut");
-  auto dw = scope->Var("DW");
-
-  auto w_t = w->GetMutable<f::LoDTensor>();
-  auto ids_t = ids->GetMutable<f::LoDTensor>();
-  auto out_t = out->GetMutable<f::LoDTensor>();
-  auto dw_t = dw->GetMutable<f::LoDTensor>();
-
-  int bsz = 2;
-  int dim = 2;
-  int seqlen = 2;
-  int vocab_size = 4;
-
-  std::vector<int64_t> val_int(bsz * seqlen, 3);
-  std::vector<float> val(vocab_size * dim, 0.);
-  std::vector<float> val_out(bsz * seqlen * dim, 1.);
-
-  TensorFromVector(val_int, ctx, ids_t);
-  TensorFromVector(val, ctx, w_t);
-  TensorFromVector(val, ctx, dw_t);
-  TensorFromVector(val_out, ctx, out_t);
-
-  w_t->Resize({vocab_size, dim});
-  ids_t->Resize({bsz, seqlen});
-  out_t->Resize({bsz, seqlen, dim});
-  dw_t->Resize({vocab_size, dim});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  out_t->mutable_data<T>(place);
-  w_t->mutable_data<T>(place);
-  dw_t->mutable_data<T>(place);
-  f::AttributeMap attrs = {{}};
-  auto op = f::OpRegistry::CreateOp(
-      "lookup_table_v2_grad",
-      {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}},
-      {{"W@GRAD", {"DW"}}}, attrs);
-  op->Run(*scope, place);
-  ctx.Wait();
-  std::vector<T> w_v;
-  TensorToVector(*dw_t, ctx, &w_v);
-  ctx.Wait();
-  EXPECT_EQ(dw_t->numel(), vocab_size * dim);
-  T res = std::accumulate(w_v.begin(), w_v.end(), 0.);
-  float eps = 1.e-6;
-  EXPECT_LT(fabs(res - bsz * seqlen * dim), eps);
-}
-
-TEST(lookup_table_v2, NPU_fp32) {
-  f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
-}
-
-TEST(lookup_table_v2_grad, NPU_fp32) {
-  f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  CompareGrad<float>(&scope, ctx);
-}
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
index 676086bd08063..d6e982039fa29 100644
--- a/paddle/fluid/operators/mean_op_npu.cc
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -68,10 +68,8 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
     Tensor mean_tensor(grad->type());
     mean_tensor.Resize({1});
     mean_tensor.mutable_data<T>(context.GetPlace());
-    std::vector<T> mean_vec;
-    mean_vec.push_back(1.0 / static_cast<T>(IG->numel()));
-    framework::TensorFromVector(mean_vec, context.device_context(),
-                                &mean_tensor);
+    FillNpuTensorWithConstant<T>(
+        &mean_tensor, static_cast<T>(1.0 / static_cast<float>(IG->numel())));
 
     // means mul ones
     Tensor mean_ma(grad->type());
diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc
index d10d5bf12e6b4..4e10498efa10c 100644
--- a/paddle/fluid/operators/memcpy_op.cc
+++ b/paddle/fluid/operators/memcpy_op.cc
@@ -105,16 +105,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
              "is the same as input X.");
     AddAttr<int>("dst_place_type",
                  "Determine the dst place of tensor copy. "
-                 "By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other "
-                 "place type is Unimplemented and will cause ERROR."
+                 "By now it ONLY supports CUDAPlace <-> CUDAPinnedPlace or "
+                 "NPUPlace <-> CPUPlace. "
+                 "Other place types are Unimplemented and will cause an ERROR."
                  "0: dst is on CPUPlace. "
                  "1: dst is on CUDAPlace. "
                  "2: dst is on CUDAPinnedPlace. "
-                 "3: dst is on XPUPlace. ");
+                 "3: dst is on XPUPlace. "
+                 "4: dst is on NPUPlace. ");
     AddComment(R"DOC(
     Memcpy Operator.
-    By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace,
-    and used as an internal op by Recompute-Offload.
+    By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or
+    NPUPlace <-> CPUPlace, and is used as an internal op by Recompute-Offload.
     You would have to update it if you want other more capacities.
 
 Out = X, when type in [LoDTensor]
@@ -146,3 +148,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
                                 ops::MemcpyKernel, plat::float16,
                                 ops::MemcpyKernel);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
+                               ops::MemcpyKernel, int, ops::MemcpyKernel,
+                               int64_t, ops::MemcpyKernel, bool,
+                               ops::MemcpyKernel, plat::float16,
+                               ops::MemcpyKernel);
+#endif
diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h
old mode 100755
new mode 100644
index f81ca05f4380a..63a41cc723731
--- a/paddle/fluid/operators/memcpy_op.h
+++ b/paddle/fluid/operators/memcpy_op.h
@@ -51,7 +51,17 @@ class MemcpyFunctor {
     } else if (dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
-    } else {
+    }
+#ifdef PADDLE_WITH_ASCEND_CL
+    else if (dst_place_type_ == 0) {  // NOLINT
+      framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_,
+                            &out_tensor);
+    } else if (dst_place_type_ == 4) {
+      framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
+                            &out_tensor);
+    }
+#endif
+    else {  // NOLINT
       PADDLE_THROW(platform::errors::Unimplemented(
           "memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
     }
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index aa0c4d2dfd274..276bfa7b3281b 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclrtStream GetCurrentNPUStream() {
-  int device_id = platform::GetCurrentNPUDeviceId();
+aclrtStream GetCurrentNPUStream(int device_id) {
+  if (device_id == -1) {
+    device_id = platform::GetCurrentNPUDeviceId();
+  }
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
       pool.Get(platform::NPUPlace(device_id)));
@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
   VLOG(4) << "after aclopCompileAndExecute: " << ret;
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index e178f7fc6e96d..5506ddd89692b 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -86,6 +86,43 @@ class NpuOpRunner {
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
 
+aclrtStream GetCurrentNPUStream(int device_id = -1);
+
+template <typename T>
+void FillNpuTensorWithConstant(Tensor *tensor, T val) {
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("The tensor should be initialized."));
+  PADDLE_ENFORCE_EQ(
+      platform::is_npu_place(tensor->place()), true,
+      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
+  // do async for better performance
+  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
+    Tensor tmp(tensor->type());
+    tmp.Resize(tensor->dims());
+    tmp.mutable_data<T>(tensor->place());
+    auto stream = GetCurrentNPUStream(
+        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
+    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
+                             stream);
+    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
+                              {{"power", static_cast<float>(1)},
+                               {"scale", static_cast<float>(0)},
+                               {"shift", static_cast<float>(val)}});
+    runner.Run(stream);
+  } else {
+    T *array = new T[tensor->numel()];
+    for (unsigned int i = 0; i < tensor->numel(); ++i)
 {
+      array[i] = static_cast<T>(val);
+    }
+    // do sync copy
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tensor->data<void>(), platform::CPUPlace(), array,
+                 tensor->numel() * sizeof(T), nullptr);
+    delete[] array;
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
 #endif
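For float/float16, FillNpuTensorWithConstant avoids a host-to-device copy entirely: it zeroes a scratch tensor on-device and runs the Power op, which computes (scale * x + shift)^power elementwise, so power = 1, scale = 0, shift = val turns the zeros into the constant. A standalone sketch of the arithmetic (not the ACL operator itself):

```cpp
#include <cmath>
#include <iostream>
#include <vector>

// Elementwise (scale * x + shift)^power, the semantics the kernel relies on.
void Power(std::vector<float>* x, float power, float scale, float shift) {
  for (auto& v : *x) v = std::pow(scale * v + shift, power);
}

int main() {
  std::vector<float> t(8, 0.0f);  // like NPUMemsetAsync(..., 0, ...)
  Power(&t, /*power=*/1.0f, /*scale=*/0.0f, /*shift=*/3.5f);
  std::cout << t[0] << " " << t[7] << "\n";  // 3.5 3.5
}
```

With scale = 0 the memset to zero is not even strictly needed for the math, but it keeps the scratch input well-defined; non-float types fall back to the synchronous host-side copy in the else branch.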
diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc
index 134544c2f65bc..b024aca21c382 100644
--- a/paddle/fluid/operators/optimizers/adam_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc
@@ -61,8 +61,23 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     param_out->mutable_data<T>(ctx.GetPlace());
     mom1_out->mutable_data<T>(ctx.GetPlace());
     mom2_out->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-    beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+
+    // NOTE(zhiqiu): beta1_pow and beta2_pow may be on CPU, not transformed
+    // to the NPU place.
+    if (beta1_pow->place() == platform::CPUPlace()) {
+      T beta1 = *beta1_pow->data<T>();
+      // `mutable_data` needs to be called after getting the data
+      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+      FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
+    } else {
+      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+    }
+    if (beta2_pow->place() == platform::CPUPlace()) {
+      T beta2 = *beta2_pow->data<T>();
+      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+      FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
+    } else {
+      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+    }
 
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     if (ctx.HasInput("Beta1Tensor")) {
@@ -100,18 +115,15 @@ class AdamNPUKernel : public framework::OpKernel<T> {
 
     // reshape
     Tensor beta1_tensor(framework::proto::VarType::FP32);
-    beta1_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<float>{beta1}, ctx.device_context(),
-                     &beta1_tensor);
+    beta1_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    FillNpuTensorWithConstant<float>(&beta1_tensor, beta1);
 
     Tensor beta2_tensor(framework::proto::VarType::FP32);
-    beta2_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<float>{beta2}, ctx.device_context(),
-                     &beta2_tensor);
+    beta2_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    FillNpuTensorWithConstant<float>(&beta2_tensor, beta2);
 
     Tensor epsilon_tensor(framework::proto::VarType::FP32);
     epsilon_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<float>{epsilon}, ctx.device_context(),
-                     &epsilon_tensor);
+    FillNpuTensorWithConstant<float>(&epsilon_tensor, epsilon);
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -130,16 +142,19 @@ class AdamNPUKernel : public framework::OpKernel<T> {
 
     // NOTE(zhiqiu): ApplyAdamD updates params inplace, so
     // if param and param_out are not the same, we need to do a copy.
     if (param_out->data<T>() != param->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*param, ctx.GetPlace(), param_out);
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), param_out);
     }
     if (mom1_out->data<T>() != mom1->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom1_out);
     }
     if (mom2_out->data<T>() != mom2->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom2_out);
     }
     auto runner_m1 =
         NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {});
diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc
index b7aaff5d45791..a8d19148ef520 100644
--- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc
@@ -44,8 +44,9 @@ class SGDNPUKernel : public framework::OpKernel<T> {
 
     // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
     // if param and param_out are not the same, we need to do a copy.
     if (param_out->data<T>() != param_var->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out);
+      framework::TensorCopy(
+          *param_var, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), param_out);
     }
   }
 };
diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc
index 228372e1e93e0..a9a2effd2eb9d 100644
--- a/paddle/fluid/operators/range_op_npu.cc
+++ b/paddle/fluid/operators/range_op_npu.cc
@@ -39,11 +39,23 @@ class RangeNPUKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
 
     framework::Tensor n;
-    framework::TensorCopySync(*start_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *start_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T start = n.data<T>()[0];
-    framework::TensorCopySync(*end_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *end_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T end = n.data<T>()[0];
-    framework::TensorCopySync(*step_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *step_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T step = n.data<T>()[0];
 
     int64_t size = 0;
diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc
index 562a560b2f154..f2f395314c0cc 100644
--- a/paddle/fluid/operators/range_op_npu_test.cc
+++ b/paddle/fluid/operators/range_op_npu_test.cc
@@ -87,6 +87,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
 
 TEST(range, NPU) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<int>(&scope, ctx, "range");
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<int>(&scope, *ctx, "range");
 }
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index b29493404f453..f5d55791d86c6 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -53,9 +53,25 @@ BufferedReader::BufferedReader(
     stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx);
   }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (platform::is_npu_place(place_)) {
+    int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device;
+    compute_stream_ =
+        ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
+                                            .Get(place_)))
+            ->stream();
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
+      event = platform::NpuEventResourcePool::Instance().New(dev_idx);
+    }
+    stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
+  }
+#endif
   is_same_place_ = false;
   cpu_buffer_.resize(buffer_size);
   cuda_buffer_.resize(buffer_size);
+  npu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
 }
@@ -196,7 +212,59 @@ void BufferedReader::ReadAsync(size_t i) {
 #endif
       }
     }
-#endif  // @} End Group GPU Place
+#endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+    if (platform::is_npu_place(place_)) {
+      TensorVec &npu = npu_buffer_[i];
+      if (npu.empty()) {
+        npu.resize(cpu.size());
+      } else {
+        PADDLE_ENFORCE_EQ(
+            npu.size(), cpu.size(),
+            platform::errors::InvalidArgument(
+                "Input tensor number on NPU and CPU devices are not matched. "
+                "The number on NPU is %d, on CPU is %d",
+                npu.size(), cpu.size()));
+      }
+
+      std::vector<void *> npu_ptrs;
+      npu_ptrs.reserve(cpu.size());
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        npu[i].Resize(cpu[i].dims());
+        npu[i].set_layout(cpu[i].layout());
+        npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
+      }
+
+      platform::SetNPUDeviceId(
+          BOOST_GET_CONST(platform::NPUPlace, place_).device);
+      PADDLE_ENFORCE_NPU_SUCCESS(
+          aclrtRecordEvent(events_[i].get(), compute_stream_));
+      PADDLE_ENFORCE_NPU_SUCCESS(
+          aclrtStreamWaitEvent(stream_.get(), events_[i].get()));
+
+      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data<void>();
+        auto npu_ptr = npu_ptrs[i];
+        auto size =
+            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+        if ((platform::is_npu_place(cpu_place))) {
+          memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
+                       BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr,
+                       size, stream_.get());
+        } else {
+          memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
+                       BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr,
+                       size, stream_.get());
+          PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
+        }
+        npu[i].set_lod(cpu[i].lod());
+      }
+      PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
+    }
+#endif
     return i;
   }));
 }
@@ -228,9 +296,13 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
     return;
   }
 
-  *out = std::move((platform::is_gpu_place(place_) && !is_same_place_)
-                       ? cuda_buffer_[i]
-                       : cpu_buffer_[i]);
+  if (platform::is_gpu_place(place_) && !is_same_place_) {
+    *out = std::move(cuda_buffer_[i]);
+  } else if (platform::is_npu_place(place_) && !is_same_place_) {
+    *out = std::move(npu_buffer_[i]);
+  } else {
+    *out = std::move(cpu_buffer_[i]);
+  }
 
   // Do not push current position into ReadAsync. Push the previous position
   // Since all computation in fluid are async, change the data of
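The record/wait pair above keeps the reader's dedicated copy stream from racing ahead of the compute stream: the H2D copy into slot i may only start once the compute stream has passed the point where the event was recorded. A standalone sketch of that handshake, with host threads standing in for the two streams (FakeEvent is illustrative, not aclrtEvent):

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

struct FakeEvent {
  std::mutex m;
  std::condition_variable cv;
  bool recorded = false;
  void Record() {  // like aclrtRecordEvent(event, compute_stream)
    { std::lock_guard<std::mutex> g(m); recorded = true; }
    cv.notify_one();
  }
  void Wait() {  // like aclrtStreamWaitEvent(copy_stream, event)
    std::unique_lock<std::mutex> l(m);
    cv.wait(l, [&] { return recorded; });
  }
};

int main() {
  FakeEvent event;
  std::thread copy_stream([&] {
    event.Wait();
    std::cout << "copy stream: start H2D for next batch\n";
  });
  std::cout << "compute stream: previous batch consumed\n";
  event.Record();
  copy_stream.join();
}
```

On real hardware the wait is enqueued on the stream rather than blocking a thread, which is what lets the reader overlap host-to-device transfers with computation.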
Push the previous position // Since all computation in fluid are async, change the data of diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index fbc46aceb8130..9f7b0e753281e 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -25,7 +25,10 @@ #include "paddle/fluid/platform/cuda_resource_pool.h" #include "paddle/fluid/platform/gpu_info.h" #endif - +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_resource_pool.h" +#endif namespace paddle { namespace operators { namespace reader { @@ -67,12 +70,19 @@ class BufferedReader : public framework::DecoratedReader { bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; + std::vector npu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_ASCEND_CL + aclrtStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d408ff3988f03..1eeeb5e1f8aa1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(reduce_any, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc new file mode 100644 index 0000000000000..1fb136a5110db --- /dev/null +++ b/paddle/fluid/operators/save_combine_op_npu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc new file mode 100644 index 0000000000000..90db1a0bb85d6 --- /dev/null +++ b/paddle/fluid/operators/save_op_npu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index f06f59f3b4e00..d20b3ac04bf95 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -159,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(softmax, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(softmax_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index c777a02f96bd9..a34946315f5a8 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -67,12 +67,10 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); @@ -142,12 +140,10 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 22dc81cbd79e0..1de7ca8c7bdbf 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(squeeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = 
p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 046ae90ec7c6e..f1b64f042c3c0 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, framework::LoDTensor cpu_tensor; platform::CPUPlace cpu_place; TensorCopy(print_tensor, cpu_place, &cpu_tensor); +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(print_tensor.place())) { + platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); + } +#endif data = cpu_tensor.data(); } diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 36f7a69535851..f6712814e1e3b 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(transpose2, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(transpose2_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4253187fdde74..7f3190d9112c6 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -35,28 +35,24 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { float mean = ctx.Attr("mean"); Tensor mean_tensor(framework::proto::VarType::FP32); mean_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{mean}, ctx.device_context(), - &mean_tensor); + FillNpuTensorWithConstant(&mean_tensor, mean); float std = ctx.Attr("std"); Tensor std_tensor(framework::proto::VarType::FP32); std_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{std}, ctx.device_context(), - &std_tensor); + FillNpuTensorWithConstant(&std_tensor, std); int32_t seed_var = ctx.Attr("seed"); Tensor min_tensor(framework::proto::VarType::FP32); min_tensor.mutable_data({1}, ctx.GetPlace()); float min_value = mean - std * 2.0; - TensorFromVector(std::vector{min_value}, ctx.device_context(), - &min_tensor); + FillNpuTensorWithConstant(&min_tensor, min_value); Tensor max_tensor(framework::proto::VarType::FP32); max_tensor.mutable_data({1}, ctx.GetPlace()); float max_value = mean + std * 2.0; - TensorFromVector(std::vector{max_value}, ctx.device_context(), - &max_tensor); + FillNpuTensorWithConstant(&max_tensor, max_value); auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index 9b4485047f05c..a145c914a8621 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(unsqueeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = 
p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 584dbd4756aa0..1d3fc14cdd16d 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -143,6 +143,11 @@ if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context cuda_resource_pool) endif() +if(WITH_ASCEND_CL) + cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info) + target_link_libraries(device_context npu_resource_pool) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) if(WITH_GPU) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0ade3898c336..50bb64d557444 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif - #include "glog/logging.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -254,8 +254,9 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - NPUDeviceGuard guard(place_.device); - PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); + platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "NPU context(" << this << ") Wait"; + stream_->Wait(); } aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2578c9b6cdea5..3d72727c8da8e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -197,6 +197,13 @@ class NPUDeviceContext : public DeviceContext { void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; } #endif + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } + + void WaitStreamCallback() const { return stream_->WaitCallback(); } + private: NPUPlace place_; aclrtContext context_; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 717b5ce83c6c9..724a9b8483cde 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer { BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId()); } else if (platform::is_cuda_pinned_place(r.place)) { event->set_place(proto::MemEvent::CUDAPinnedPlace); + } else if (platform::is_npu_place(r.place)) { + event->set_place(proto::MemEvent::NPUPlace); } else { PADDLE_THROW(platform::errors::Unimplemented( "The current place is not supported.")); diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h new file mode 100644 index 0000000000000..a7b674d0d0c3f --- /dev/null +++ b/paddle/fluid/platform/npu_profiler.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "acl/acl_prof.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_ASCEND_STRING +// For CANN 20.2+ +// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats +// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline +// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory +// ACL_AICORE_L0B_AND_WIDTH = 3, record internal memory +// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMETIC_UTILIZATION; +#else +// For CANN 20.1 +// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats +// ACL_AICORE_PIPELINE = 1, record pipeline +// ACL_AICORE_SYNCHRONIZATION = 2, record sync +// ACL_AICORE_MEMORY = 3, record memory +// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory +// ACL_AICORE_STALL = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMATIC_THROUGHPUT; +#endif + +// ACL_PROF_ACL_API, record ACL API stats +// ACL_PROF_TASK_TIME, record AI core stats +// ACL_PROF_AICORE_METRICS, must be included +// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet +constexpr uint64_t default_type = + ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME; + +aclprofConfig *NPUProfilerCreateConfig( + std::vector devices = {}, + aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type, + aclprofAicoreEvents *events = nullptr) { + if (devices.size() == 0) { + int device_id = GetCurrentNPUDeviceId(); + devices.emplace_back(device_id); + } + aclprofConfig *config = + aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c); + PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External( + "Failed to create prof config for NPU")); + return config; +} + +void NPUProfilerDestroyConfig(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config)); +} + +void NPUProfilerInit(std::string output_path) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclprofInit(output_path.c_str(), output_path.size())); +} + +void NPUProfilerStart(const aclprofConfig *config) { + if (config == nullptr) { + // NOTE(zhiqiu): support single device by default. + int device_id = GetCurrentNPUDeviceId(); + std::vector devices = {static_cast(device_id)}; + config = NPUProfilerCreateConfig(devices); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config)); +} + +void NPUProfilerStop(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config)); + NPUProfilerDestroyConfig(config); +} + +void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); } + +struct NPUProfConfigWrapper { + aclprofConfig *p_; + explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {} + aclprofConfig *ptr() { return p_; } +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/npu_resource_pool.cc b/paddle/fluid/platform/npu_resource_pool.cc new file mode 100644 index 0000000000000..22b9e8f03971e --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_resource_pool.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace platform { + +NpuStreamResourcePool::NpuStreamResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtStream stream; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream)); + return stream; + }; + + auto deleter = [dev_idx](aclrtStream stream) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuStreamResourcePool& NpuStreamResourcePool::Instance() { + static NpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr NpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +NpuEventResourcePool::NpuEventResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtEvent event; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); + return event; + }; + + auto deleter = [dev_idx](aclrtEvent event) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuEventResourcePool& NpuEventResourcePool::Instance() { + static NpuEventResourcePool pool; + return pool; +} + +std::shared_ptr NpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/npu_resource_pool.h b/paddle/fluid/platform/npu_resource_pool.h new file mode 100644 index 0000000000000..bfd6ec7f94112 --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using NpuStreamObject = std::remove_pointer::type; +using NpuEventObject = std::remove_pointer::type; + +class NpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuStreamResourcePool &Instance(); + + private: + NpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class NpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuEventResourcePool &Instance(); + + private: + NpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index cfa3c6906f83f..31193534a00be 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -21,6 +21,7 @@ message Event { enum EventType { CPU = 0; GPUKernel = 1; + NPUKernel = 2; } optional EventType type = 8; optional string name = 1; @@ -39,6 +40,8 @@ message MemEvent { CUDAPlace = 0; CPUPlace = 1; CUDAPinnedPlace = 2; + XPUPlace = 3; + NPUPlace = 4; } optional uint64 start_ns = 1; optional uint64 end_ns = 2; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 287c8fc37e005..9f4ec9b3ce0d4 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -71,6 +71,8 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_ASCEND_CL + VLOG(3) << "aclrtLaunchCallback at stream: " << stream_; + // TODO(zhiqiu): failed to call aclrtLaunchCallback PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_)); #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 39a78d86976ae..038bcc7f85099 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -109,6 +109,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_profiler.h" #endif #ifdef PADDLE_WITH_XPU @@ -581,11 +582,6 @@ PYBIND11_MODULE(core_noavx, m) { make_ddim(x_dim), make_ddim(y_dim), -1)); }); -#ifdef PADDLE_WITH_ASCEND_CL - m.def("_npu_finalize", - []() { platform::AclInstance::Instance().Finalize(); }); -#endif - m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -1744,7 +1740,7 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use NPU because you have installed CPU/GPU version " "PaddlePaddle.\n" "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "PaddlePaddle by: pip install paddlepaddle-npu\n" "If you only have CPU, please change NPUPlace(%d) to be " "CPUPlace().\n", dev_id); @@ -2180,6 +2176,31 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL + m.def("get_npu_device_count", platform::GetNPUDeviceCount); + m.def("_npu_finalize", []() { + platform::AclInstance::Instance().Finalize(); + }); // private interface + + py::class_(m, "NPUProfConfigWrapper"); + + m.def("npu_prof_init", platform::NPUProfilerInit); + m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStart(c.ptr()); + }); + m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStop(c.ptr()); + }); + m.def("npu_prof_finalize", platform::NPUProfilerFinalize); + m.def("npu_prof_create_config", []() { + return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); + }); + + m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerDestroyConfig(c.ptr()); + }); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ab1dd8a180b5b..416361d06a996 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -663,6 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); + bool is_npu_tensor = platform::is_npu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -681,7 +682,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor && !is_xpu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -749,6 +750,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); +#endif + } else if (is_npu_tensor) { +#ifdef PADDLE_WITH_ASCEND_CL + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; 
+#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index cfb4b12599385..768248e136b05 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -2041,6 +2041,10 @@ def set_var(var, ndarray): p = paddle.fluid.core.Place() p.set_place(t._place()) place = paddle.fluid.XPUPlace(p.xpu_device_id()) + elif p.is_npu_place(): + p = paddle.fluid.core.Place() + p.set_place(t._place()) + place = paddle.fluid.NPUPlace(p.npu_device_id()) else: p = paddle.fluid.core.Place() p.set_place(t._place()) @@ -2335,6 +2339,10 @@ def set_program_state(program, state_dict): p = paddle.fluid.core.Place() p.set_place(ten_place) py_place = paddle.fluid.XPUPlace(p.xpu_device_id()) + elif ten_place.is_npu_place(): + p = paddle.fluid.core.Place() + p.set_place(ten_place) + py_place = paddle.fluid.NPUPlace(p.npu_device_id()) ten.set(new_para_np, py_place) diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index bc7a60af94617..40b0862be0177 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None): os.remove(config_file) + +@signature_safe_contextmanager +def npu_profiler(output_file, config=None): + """ + The NPU profiler. + + This function is used to profile an NPU program via the NPU runtime application + programming interface. The profiling result will be written into + `output_file`. Users can set the NPU profiling config via the `config` argument. + + After getting the profiling result file, users can use + `tools provided by Ascend `_ + to load this output file to visualize results. + + Args: + output_file (str) : The output file name, the result will be + written into this file. It should be an absolute path. + config (list, optional) : NPU profile config. For more details, please + refer to `User Guide `_ . + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle.fluid.profiler as profiler + import numpy as np + + epoc = 8 + dshape = [4, 3, 28, 28] + data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32') + conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'npu.txt' + with profiler.npu_profiler(output_file) as npu_prof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + # then use NPU profiler tools to load this output file + # to visualize results. + """ + # TODO: support config in python. + if not config: + config = core.npu_prof_create_config() + + core.npu_prof_init(output_file) + # Enables profiler collection by the active NPU profiling tool. + core.npu_prof_start(config) + try: + yield + # Disables profiler collection. + finally: + core.npu_prof_stop(config) + core.npu_prof_finalize() + + def reset_profiler(): """ Clear the previous time record.
This interface does not work for diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py index 4cda0ceeccf9c..ac80ea4c62cbf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -14,6 +14,8 @@ import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py index b39771e29c7b4..ae48866b7b969 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py @@ -73,5 +73,27 @@ def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) +class TestCast3(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cast" + self.place = paddle.NPUPlace(0) + + ipt = np.random.random(size=[10, 10]) + 1 + self.inputs = {'X': ipt.astype('int32')} + self.outputs = {'Out': ipt.astype('int32')} + + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.INT32), + 'out_dtype': int(core.VarDesc.VarType.INT32) + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py index a102f3d9ce185..3e2e8f944b84c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py @@ -26,7 +26,7 @@ paddle.enable_static() SEED = 2021 -NPUPlace = 5 +NPUPlace = 0 @unittest.skipIf(not paddle.is_compiled_with_npu(), diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 400ddd9d4aab0..2463ddb7137ac 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -41,7 +41,7 @@ def setUp(self): vocab = 10 dim = 20 w = np.ones([vocab, dim]).astype(self.dtype) - x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) out = np.ones([bsz, seqlen, dim]).astype(self.dtype) self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py new file mode 100755 index 0000000000000..63c4fb8e5885e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMemcpy_FillConstant(unittest.TestCase): + def get_prog(self): + paddle.enable_static() + main_program = Program() + with program_guard(main_program): + cpu_var_name = "tensor@Cpu" + npu_var_name = "tensor@Npu" + cpu_var = main_program.global_block().create_var( + name=cpu_var_name, + shape=[10, 10], + dtype='float32', + persistable=False, + stop_gradient=True) + npu_var = main_program.global_block().create_var( + name=npu_var_name, + shape=[10, 10], + dtype='float32', + persistable=False, + stop_gradient=True) + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": npu_var_name}, + attrs={ + "shape": [10, 10], + "dtype": npu_var.dtype, + "value": 1.0, + "place_type": 1 + }) + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": cpu_var_name}, + attrs={ + "shape": [10, 10], + "dtype": cpu_var.dtype, + "value": 0.0, + "place_type": 2 + }) + return main_program, npu_var, cpu_var + + def test_npu_copy_to_cpu(self): + main_program, npu_var, cpu_var = self.get_prog() + main_program.global_block().append_op( + type='memcpy', + inputs={'X': npu_var}, + outputs={'Out': cpu_var}, + attrs={'dst_place_type': 0}) + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + npu_, cpu_ = exe.run(main_program, + feed={}, + fetch_list=[npu_var.name, cpu_var.name]) + self.assertTrue(np.allclose(npu_, cpu_)) + self.assertTrue(np.allclose(cpu_, np.ones((10, 10)))) + + def test_cpu_copy_to_npu(self): + main_program, npu_var, cpu_var = self.get_prog() + main_program.global_block().append_op( + type='memcpy', + inputs={'X': cpu_var}, + outputs={'Out': npu_var}, + attrs={'dst_place_type': 4}) + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + npu_, cpu_ = exe.run(main_program, + feed={}, + fetch_list=[npu_var.name, cpu_var.name]) + self.assertTrue(np.allclose(npu_, cpu_)) + self.assertTrue(np.allclose(npu_, np.zeros((10, 10)))) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index e65a3dac73928..4fcfd33b32f4e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -248,8 +248,9 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + self.assertTrue(np.allclose( + npu_pred, cpu_pred, atol=1e-5)) # atol needed on CANN 20.3 + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) @unittest.skipIf(not paddle.is_compiled_with_npu(), diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py index 087256b298088..583a648224d73 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py @@ -16,6 +16,8 @@
import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py new file mode 100644 index 0000000000000..e7e7fb39c913b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.nn import Embedding +import paddle.fluid.framework as framework +from paddle.fluid.optimizer import Adam +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope +from paddle.fluid.executor import global_scope +import numpy as np +import six +import pickle +import os +import errno +from test_static_save_load import * + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUSaveLoadBase(TestSaveLoadBase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUSaveLoadPartial(TestSaveLoadPartial): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUSaveLoadSetStateDict(TestSaveLoadSetStateDict): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUProgramStatePartial(TestProgramStatePartial): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPULoadFromOldInterface(TestLoadFromOldInterface): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPULoadFromOldInterfaceSingleFile(TestLoadFromOldInterfaceSingleFile): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUProgramStateOldSave(TestProgramStateOldSave): + def setUp(self): + self.test_dygraph = False + + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + 
+@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNPUProgramStateOldSaveSingleModel(TestProgramStateOldSaveSingleModel): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_npu( + ) else paddle.NPUPlace(0) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py index f00a3c103c817..ed21549b7e01f 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py @@ -36,7 +36,7 @@ def setUp(self): self.op_type = "assign" self.init_dtype() - x = np.rand.random([3, 3]) + x = np.random.random([3, 3]).astype(self.dtype) self.inputs = {'X': x} self.attrs = {} @@ -46,7 +46,7 @@ def set_npu(self): self.__class__.use_npu = True def init_dtype(self): - self.dtype = np.int64 + self.dtype = np.float32 def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index 8fd250f2a52c2..f5bccf7ab09b6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -101,7 +101,7 @@ def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True): class TestStaticDataLoader(unittest.TestCase): - def run_main(self, num_workers, places): + def run_main(self, num_workers, places, use_pe=True): scope = fluid.Scope() with fluid.scope_guard(scope): startup_prog, main_prog, image, label, loss = simple_fc_net_static() @@ -120,10 +120,13 @@ def run_main(self, num_workers, places): exe = fluid.Executor(place=places[0]) exe.run(startup_prog) - prog = fluid.CompiledProgram(main_prog) - if len(places) > 1: - prog = prog.with_data_parallel( - loss_name=loss.name, places=places) + if use_pe: + prog = fluid.CompiledProgram(main_prog) + if len(places) > 1: + prog = prog.with_data_parallel( + loss_name=loss.name, places=places) + else: + prog = main_prog step_list = [] loss_list = [] diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index 51c543c5f7464..cfce0bb7d311b 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -19,7 +19,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.dygraph.nn import Embedding +from paddle.nn import Embedding import paddle.fluid.framework as framework from paddle.fluid.optimizer import Adam from paddle.fluid.dygraph.base import to_variable @@ -31,6 +31,8 @@ import os import errno +paddle.enable_static() + class SimpleLSTMRNN(fluid.Layer): def __init__(self, @@ -159,11 +161,10 @@ def __init__(self, num_layers=num_layers, init_scale=init_scale, dropout=dropout) - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + self.embedding = paddle.nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=hidden_size, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale))) @@ -187,6 +188,8 @@ def forward(self, input, label, init_hidden, 
init_cell): init_c = fluid.layers.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size]) + # The NPU 'top_k' kernel only supports `int32` dtype, so cast `input` from `int64` to `int32`. + input = fluid.layers.cast(input, "int32") x_emb = self.embedding(input) x_emb = fluid.layers.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size]) @@ -214,6 +217,10 @@ def forward(self, input, label, init_hidden, init_cell): class TestSaveLoadBase(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -235,8 +242,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -315,6 +321,10 @@ def test_ptb_rnn_cpu_float32(self): class TestSaveLoadPartial(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -336,8 +346,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -425,6 +434,10 @@ def test_ptb_rnn_cpu_float32(self): class TestSaveLoadSetStateDict(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -446,8 +459,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -526,6 +538,10 @@ def test_ptb_rnn_cpu_float32(self): class TestProgramStatePartial(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -547,8 +563,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -708,14 +723,17 @@ def test_ptb_rnn_cpu_float32(self): class TestVariableInit(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_variable_init(self): x = fluid.data(name="x", shape=[10, 10], dtype='float32') y = fluid.layers.fc(x, 10) z = fluid.layers.fc(y, 10) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -738,8 +756,7 @@ def set_var(var, ndarray): program = fluid.default_main_program() new_scope = fluid.core.Scope() - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) parameter_list = list( filter(fluid.io.is_parameter,
program.list_vars())) @@ -798,6 +815,10 @@ def setUp(self): if os.path.exists("test_static_load_var_list.pdparams"): os.remove("test_static_load_var_list.pdparams") + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_load_from_old_interface(self): seed = 90 hidden_size = 10 @@ -819,8 +840,7 @@ def test_load_from_old_interface(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -935,8 +955,7 @@ def test_load_from_old_interface_var_list(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -1027,6 +1046,10 @@ def test_load_from_old_interface_var_list(self): class TestLoadFromOldInterfaceSingleFile(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_load_from_old_interface(self): seed = 90 hidden_size = 10 @@ -1048,8 +1071,7 @@ def test_load_from_old_interface(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -1171,6 +1193,13 @@ def test_load_from_old_interface(self): class TestProgramStateOldSave(unittest.TestCase): + def setUp(self): + self.test_dygraph = True + + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -1192,8 +1221,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = fluid.layers.data( @@ -1299,11 +1327,12 @@ def symlink_force(target, link_name): fluid.set_program_state(main_program, program_state) self.check_in_static(main_program, base_map) - # make sure `load_program_state` can be used in dynamic graph mode - with fluid.dygraph.guard(place): - load_state = fluid.load_program_state("test_program_1") - for k, v in load_state.items(): - self.assertTrue(np.array_equal(base_map[k], v)) + if self.test_dygraph: + # make sure `load_program_state` can be used in dynamic graph mode + with fluid.dygraph.guard(place): + load_state = fluid.load_program_state("test_program_1") + for k, v in load_state.items(): + self.assertTrue(np.array_equal(base_map[k], v)) def create_symlink(self, target, link_name): try: @@ -1323,6 +1352,10 @@ def check_in_static(self, main_program, base_map): class TestProgramStateOldSaveSingleModel(unittest.TestCase): + def set_place(self): + return fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -1344,8 +1377,7 @@ def test_ptb_rnn_cpu_float32(self): num_steps=num_steps, init_scale=init_scale) - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) + place = self.set_place() exe = fluid.Executor(place) sgd = Adam(learning_rate=1e-3) x = 
fluid.layers.data( diff --git a/tools/timeline.py b/tools/timeline.py index 119018380b551..2a399b71b7786 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -186,6 +186,13 @@ def _allocate_pids(self): self._chrome_trace.emit_pid( "memory usage on %s:cudapinnedplace:%d" % (k, mevent.device_id), pid) + elif mevent.place == profiler_pb2.MemEvent.NPUPlace: + if (k, mevent.device_id, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "NPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:npu:%d" % (k, mevent.device_id), + pid) if (k, 0, "CPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "CPU")] = pid @@ -201,6 +208,11 @@ def _allocate_pids(self): self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid self._chrome_trace.emit_pid( "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid) + if (k, 0, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "NPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:npu:%d" % + (k, 0), pid) def _allocate_events(self): for k, profile_pb in six.iteritems(self._profile_dict): @@ -227,7 +239,8 @@ def _allocate_memory_event(self): place_to_str = { profiler_pb2.MemEvent.CPUPlace: "CPU", profiler_pb2.MemEvent.CUDAPlace: "GPU", - profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace" + profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace", + profiler_pb2.MemEvent.NPUPlace: "NPU" } for k, profile_pb in six.iteritems(self._profile_dict): mem_list = []
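Taken together, the `profiler.npu_profiler` context manager added in python/paddle/fluid/profiler.py is a thin wrapper over the `core.npu_prof_*` bindings registered in paddle/fluid/pybind/pybind.cc, which in turn call the helpers in paddle/fluid/platform/npu_profiler.h. A minimal sketch of the expanded call sequence, assuming a build with WITH_ASCEND_CL and an available NPU device; the output directory is illustrative, not taken from this patch:

    import paddle.fluid.core as core

    # Default config: current NPU device only, default AI Core metrics.
    config = core.npu_prof_create_config()
    # Initialize the ACL profiler; the docstring above asks for an absolute path.
    core.npu_prof_init("/tmp/npu_profile_out")
    core.npu_prof_start(config)
    try:
        pass  # run the NPU program to be profiled here
    finally:
        core.npu_prof_stop(config)  # NPUProfilerStop() also destroys the config
        core.npu_prof_finalize()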