Revert "[NPU] Fix the performance problem when 'axis' is not specified (
Browse files Browse the repository at this point in the history
PaddlePaddle#35116) (PaddlePaddle#35301)"

This reverts commit 2931df5.

Revert "[cherry-pick][hybrid performance] optim npu coalesce set constant (PaddlePaddle#35105) (PaddlePaddle#35302)"

This reverts commit 12260bd.

Revert "[cherry-pick][hybrid performance] optim the grad fuse for pipeline mode by sorting the grad by dtype (PaddlePaddle#35070) (PaddlePaddle#35300)"

This reverts commit e69cc21.

Revert "[cherry-pick][hybrid performance] Grad fuse for gradient merge under pipeline mode (PaddlePaddle#35004) (PaddlePaddle#35299)"

This reverts commit e931cd1.

Revert "Add flags to control whether to check Nan value of hccl_allreduce_sum. (PaddlePaddle#35093) (PaddlePaddle#35298)"

This reverts commit d4948bc.

Revert "[hybrid] Fix row parallel linear bias (PaddlePaddle#35186) (PaddlePaddle#35297)"

This reverts commit b36fb03.

Revert "[hybrid][npu] fix npu clear float status in pipeline (PaddlePaddle#35165) (PaddlePaddle#35295)"

This reverts commit 167685e.

Revert "[hybrid npu] fix npu found_finite in hybrid (PaddlePaddle#35134) (PaddlePaddle#35291)"

This reverts commit e64105f.

Revert "[cherry-pick][Hybrid Performance] Move the cast op of AMP which cast fp32 param to fp16 param to the optimizer (PaddlePaddle#34965) (PaddlePaddle#35296)"

This reverts commit 6fb58ae.

Revert "[cherry-pick] NPU use squared_l2_norm in GradientClipByGlobalNorm (PaddlePaddle#34836) (PaddlePaddle#35289)"

This reverts commit 38c27d5.
FeixLiu committed Sep 3, 2021
1 parent 2931df5 commit f6560cc
Showing 18 changed files with 89 additions and 918 deletions.
2 changes: 0 additions & 2 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -42,7 +42,6 @@ message ShardingConfig {
optional bool optimize_offload = 9 [ default = false ];
optional bool pp_allreduce_in_optimize = 10 [ default = false ];
optional int32 pp_degree = 11 [ default = 1 ];
optional bool optimize_cast = 12 [ default = false ];
}

message HybridConfig {
@@ -200,7 +199,6 @@ message DistributedStrategy {
optional int32 fuse_grad_size_in_num = 31 [ default = 8 ];
optional bool calc_comm_same_stream = 32 [ default = false ];
optional bool asp = 33 [ default = false ];
optional bool fuse_grad_merge = 34 [ default = false ];

optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
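
For context on the two fields this file loses: `optimize_cast` (ShardingConfig, field 12) and `fuse_grad_merge` (DistributedStrategy, field 34) were exposed to users through `fleet.DistributedStrategy`. A minimal sketch of how such options are toggled, assuming the usual fleet API and using only keys that survive this revert:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.sharding = True
strategy.sharding_configs = {
    "pp_degree": 1,                     # field 11, still present above
    "pp_allreduce_in_optimize": False,  # field 10, still present above
    # "optimize_cast": True,            # field 12, removed by this revert
}
# strategy.fuse_grad_merge = True       # field 34, removed by this revert

After this revert, setting either commented-out option is no longer supported.
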
84 changes: 6 additions & 78 deletions paddle/fluid/operators/coalesce_tensor_op.cc
@@ -20,67 +20,10 @@
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/npu_op_runner.h"
#endif

namespace paddle {
namespace operators {

template <typename DeviceContext>
struct FillConstantVisitor {
FillConstantVisitor(const DeviceContext &dev_ctx,
framework::LoDTensor *tensor, const float value,
framework::proto::VarType::Type dtype,
const framework::ExecutionContext &context)
: dev_ctx_(dev_ctx),
tensor_(tensor),
value_(value),
dtype_(dtype),
context_(context) {}

template <typename T>
void apply(typename std::enable_if<std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value>::type * =
nullptr) const {
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support data type for set_constant attr"));
}

template <typename T>
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(dev_ctx_.GetPlace())) {
Tensor tensor_tmp(dtype_);
tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));

const auto &runner =
NpuOpRunner("FillD", {tensor_tmp}, {*tensor_},
{{"dims", framework::vectorize(tensor_->dims())}});
auto stream =
context_.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#else
math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
#endif
}

const DeviceContext &dev_ctx_;
framework::LoDTensor *tensor_;
float value_;
framework::proto::VarType::Type dtype_;
const framework::ExecutionContext &context_;
};

template <typename DeviceContext, typename T>
class CoalesceTensorOpKernel : public framework::OpKernel<T> {
public:
@@ -127,7 +70,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
auto align_size = context.Attr<int>("align_size");
auto size_of_dtype = context.Attr<int>("user_defined_size_of_dtype");

if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
@@ -152,9 +94,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
size_t numel = 0;
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
if (size_of_dtype == -1) {
size_of_dtype = framework::SizeOfType(dtype);
}
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace(), use_align, align_size);

@@ -181,10 +121,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
: len;
}
} else if (context.Attr<bool>("set_constant")) {
framework::VisitDataType(
dtype, FillConstantVisitor<DeviceContext>(
dev_ctx, fused_tensor, context.Attr<float>("constant"),
dtype, context));
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx, fused_tensor,
static_cast<T>(context.Attr<float>("constant")));
} else if (context.Attr<bool>("persist_output")) {
for (size_t i = 0; i < out_var_names.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel());
@@ -287,13 +227,10 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
}
auto use_align = ctx->Attrs().Get<bool>("use_align");
auto align_size = ctx->Attrs().Get<int>("align_size");
auto size_of_dtype = ctx->Attrs().Get<int>("user_defined_size_of_dtype");

auto dtype = static_cast<framework::proto::VarType::Type>(
ctx->Attrs().Get<int>("dtype"));
if (size_of_dtype == -1) {
size_of_dtype = framework::SizeOfType(dtype);
}
size_t size_of_dtype = framework::SizeOfType(dtype);

auto alignment = [](size_t size, size_t align_size) {
size_t remaining = size % align_size;
@@ -371,15 +308,6 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(true);
AddAttr<int>("align_size", "The alignment size when use_align is True")
.SetDefault(-1);
AddAttr<int>("user_defined_size_of_dtype",
"The user defined size of dtype. This is used to coalesce "
"grad vars and merged_grad vars at the same time. For some "
"strategy, the dtype of fused_grad_vars and the dtype of "
"fused_grad_merged_vars are not identical, which will cause "
"the shape of these two coalesced vars are different. To "
"make sure the shape of these two vars are identical with "
"each other, this attr is added.")
.SetDefault(-1);
AddComment(R"DOC(
CoalesceTensor Operator.
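
A side note on the size bookkeeping touched above: with `user_defined_size_of_dtype` gone, the kernel always derives the element size from `dtype`, then rounds each input's byte size up to the alignment boundary, exactly as the `alignment` lambda in InferShape does. A standalone sketch of that layout math (plain Python, illustrative names, not Paddle code):

def aligned(nbytes, align_size):
    # Round nbytes up to a multiple of align_size (the `alignment` lambda above).
    remaining = nbytes % align_size
    return nbytes if remaining == 0 else nbytes + align_size - remaining

def fused_buffer_layout(numels, size_of_dtype, align_size, use_align=True):
    # Per-tensor byte offsets into the fused buffer, plus its total byte size.
    offsets, total = [], 0
    for numel in numels:
        offsets.append(total)
        nbytes = numel * size_of_dtype
        total += aligned(nbytes, align_size) if use_align else nbytes
    return offsets, total

# Example: three fp32 tensors (4-byte elements) padded to 512-byte boundaries.
# fused_buffer_layout([100, 7, 300], size_of_dtype=4, align_size=512)
# -> ([0, 512, 1024], 2560)
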
13 changes: 3 additions & 10 deletions paddle/fluid/operators/collective/c_allreduce_op.h
@@ -45,10 +45,6 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif

namespace paddle {
namespace operators {

@@ -144,7 +140,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
runner_mean.Run(stream);
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
@@ -238,11 +233,9 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
break;
}
case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(3) << "check_numerics:" << found_nan;
}
VLOG(4) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << found_nan;
break;
}
default:
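
The `ContainsNan` helper shown above reduces the input to its mean with `ReduceMeanD` and inspects that single scalar, relying on the fact that any NaN in the input propagates into the mean. A pure-numpy sketch of the same idea (not the NPU runner code):

import math
import numpy as np

def contains_nan(values):
    # A single NaN anywhere in `values` makes the mean NaN as well.
    return math.isnan(float(np.mean(values)))

# contains_nan([1.0, 2.0, 3.0])           -> False
# contains_nan([1.0, float("nan"), 3.0])  -> True
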
21 changes: 13 additions & 8 deletions paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -42,22 +42,27 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = x_dims.size() == (y_dims.size() + axis);
direct_compute =
y_dims == framework::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = y_dims.size() == (x_dims.size() + axis);
direct_compute =
x_dims == framework::slice_ddim(y_dims, axis, y_dims.size());
}

Tensor transformed_x, transformed_y;
if (direct_compute) {
const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
runner.Run(dev_ctx.stream());
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
} else {
Tensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &transformed_x,
&transformed_y);
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
runner.Run(dev_ctx.stream());
}
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};

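
The hunk above swaps between two forms of the `direct_compute` test. The shape-based form accepts the fast path only when the smaller operand's shape equals the trailing slice of the larger operand's shape starting at `axis`; otherwise the kernel must broadcast first. A shape-only sketch of that check (illustrative Python, not the kernel):

def can_add_directly(x_shape, y_shape, axis=-1):
    big, small = (x_shape, y_shape) if len(x_shape) >= len(y_shape) else (y_shape, x_shape)
    if axis == -1:
        axis = len(big) - len(small)
    # The fast path requires `small` to line up exactly with big[axis:].
    return list(big[axis:]) == list(small)

# can_add_directly([2, 3, 4], [3, 4])         -> True   (axis inferred as 1)
# can_add_directly([2, 3, 4], [3, 1], axis=1) -> False  (broadcast needed)
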
3 changes: 0 additions & 3 deletions paddle/fluid/platform/flags.cc
@@ -93,9 +93,6 @@ DEFINE_string(selected_npus, "",
"This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string.");
DEFINE_bool(hccl_check_nan, false,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value");
DEFINE_string(
npu_config_path, "",
"The absolute path of configuration json file, like: /tmp/config.json. "
60 changes: 25 additions & 35 deletions python/paddle/distributed/collective.py
@@ -1078,19 +1078,6 @@ def _linear(x, weight, bias=None, name=None):
return res


def _set_var_distributed(var):
if var is None:
return

var.is_distributed = True

# NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(var.name).is_distributed = True
main_block._find_var_recursive(var.name).is_distributed = True


def _parallel_linear(x,
num_rows,
num_cols,
@@ -1108,7 +1095,7 @@ def _parallel_linear(x,
axis the dimension of the parameter of linear layer.
axis = 0: the row dimension
axis = 1: the col dimension
axid = 1: the col dimension
"""
if group is not None and not group.is_member():
@@ -1121,35 +1108,40 @@
else:
x = _c_identity(x, group=group)

linear = paddle.nn.Linear(
num_rows,
num_cols,
weight_attr=param_attr,
bias_attr=bias_attr,
name=name)
if core.is_compiled_with_npu():
linear = _Linear(
num_rows,
num_cols,
weight_attr=param_attr,
bias_attr=bias_attr,
name=name)
else:
linear = paddle.nn.Linear(
num_rows,
num_cols,
weight_attr=param_attr,
bias_attr=bias_attr,
name=name)

linear_out = linear(x)
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(linear.weight.name).is_distributed = True
main_block._find_var_recursive(linear.weight.name).is_distributed = True

# NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = _linear if core.is_compiled_with_npu()\
else paddle.nn.functional.linear
linear_out = linear_function(
x,
linear.weight,
# NOTE(wangxi): row split, bias need add after allreduce
None if axis == 0 else linear.bias,
linear.name)

_set_var_distributed(linear.weight)
# set is_distributed for splited bias
# if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank.
# if a linear layer is splited by col, the bias would also be split into each rank as its weight
if axis == 1 and linear._bias_attr != False:
_set_var_distributed(linear.bias)
startup_block._find_var_recursive(
linear.bias.name).is_distributed = True
main_block._find_var_recursive(linear.bias.name).is_distributed = True

if not gather_out: return linear_out

op_type = 'c_allreduce_sum' if axis == 0 else 'c_concat'
out_shape = list(linear_out.shape)
out_shape[0] *= 1 if axis == 0 else nranks
main_block = paddle.static.default_main_program().current_block()
out = main_block.create_var(
shape=out_shape,
dtype=linear_out.dtype,
Expand All @@ -1168,8 +1160,6 @@ def _parallel_linear(x,
'use_calc_stream': True,
'use_model_parallel': True
})
if linear.bias is not None:
out = out + linear.bias
else:
main_block.append_op(
type='c_concat',
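
To make the row/column split bias handling above concrete: under a column split (axis = 1) each rank owns a slice of the output features and therefore a matching slice of the bias, while under a row split (axis = 0) every rank produces a partial sum over the full output, so the complete bias may only be added once, after the allreduce. A toy shape calculation (plain Python, illustrative names):

def parallel_linear_shapes(in_features, out_features, nranks, axis):
    if axis == 1:
        # Column split: weight is [in, out // nranks]; the bias is sliced the
        # same way, so each rank's bias shard must be marked is_distributed.
        return (in_features, out_features // nranks), out_features // nranks
    # Row split (axis == 0): weight is [in // nranks, out]; every rank holds the
    # full bias, which is added only after c_allreduce_sum of the partial outputs.
    return (in_features // nranks, out_features), out_features

# parallel_linear_shapes(1024, 4096, nranks=4, axis=1) -> ((1024, 1024), 1024)
# parallel_linear_shapes(1024, 4096, nranks=4, axis=0) -> ((256, 4096), 4096)
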
25 changes: 0 additions & 25 deletions python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -888,9 +888,6 @@ def sharding_configs(self):
pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on.
This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False.
optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it
will take more memory, but will be faster, trade space for time. Recommend to turn on only when using pipeline or gradient_merge_acc_step large.
Examples:
@@ -967,28 +964,6 @@ def _calc_comm_same_stream(self, same):
"WARNING: calc_comm_same_stream should have value of boolean type"
)

@property
def fuse_grad_merge(self):
"""
Set whether fuse the grad for gradient merge.
Note: this flag will only effect the gradient merge under pipeline mode
The default value for the fuse_grad_merge is False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_param_grad = True
"""
return self.strategy.fuse_grad_merge

@fuse_grad_merge.setter
@is_strict_auto
def fuse_grad_merge(self, fuse_grad_merge):
if isinstance(fuse_grad_merge, bool):
self.strategy.fuse_grad_merge = fuse_grad_merge
else:
print("WARNING: fuse_grad_merge should have value of boolean type")

@property
def fuse_grad_size_in_num(self):
"""
(11 more changed files not shown)
