Optimize layer norm backward cuda kernel when cols is 1024. (#39247)
* Add fp16 support for scale/bias for fused_layernorm_residual_dropout_bias op.

* Remove useless code.

* Remove useless code.

* Optimize layer_norm fwd when cols is 1024.

* Remove useless code.

* Minors.

* Minors.

* Modifications according to reviews.

* Minors.

* Optimize layer_norm bwd kernel when cols is 1024.

* Polish layer_norm_bwd_1024 kernel.

* Limit ln_bwd_1024_kernel to paddle_with_cuda.

* Fix double type compile error.

* Add optimization of ln bwd for fused_dropout_add_ln op.

* Polish codes.
limin2021 committed Jan 29, 2022
1 parent 92da505 commit 99cfcc0
Showing 3 changed files with 570 additions and 41 deletions.
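For context, the sketch below illustrates the basic structure a layer-norm backward kernel specialized for 1024 columns can use, which is the idea ln_bwd_1024_kernel_driver (called in the diff below) builds on: one thread block per row, each thread keeping its slice of the row in registers, and the two row-wide reductions required by the dx formula done cooperatively in shared memory. The kernel name, launch shape, and parameters here are illustrative assumptions only; the kernel added by this commit additionally fuses the residual/dropout gradients and produces d_scale and d_layernorm_bias.

#include <cuda_runtime.h>

// One thread block per row; COLS is fixed at compile time so each thread keeps
// COLS / THREADS elements in registers (4 when COLS = 1024, THREADS = 256).
template <int COLS, int THREADS>
__global__ void ln_bwd_dx_sketch(const float *__restrict__ x,
                                 const float *__restrict__ dy,
                                 const float *__restrict__ gamma,
                                 const float *__restrict__ mean,
                                 const float *__restrict__ rsigma,  // 1/sqrt(var+eps)
                                 float *__restrict__ dx) {
  constexpr int kPerThread = COLS / THREADS;
  const int row = blockIdx.x;
  const int tid = threadIdx.x;
  const float mu = mean[row];
  const float rs = rsigma[row];

  // Load the row slice, form xhat and dxhat, and accumulate the two partial
  // sums needed by the dx formula.
  float xhat[kPerThread], dxhat[kPerThread];
  float sum1 = 0.f, sum2 = 0.f;  // sum(dxhat), sum(dxhat * xhat)
  for (int i = 0; i < kPerThread; ++i) {
    const int col = tid + i * THREADS;
    xhat[i] = (x[row * COLS + col] - mu) * rs;
    dxhat[i] = dy[row * COLS + col] * gamma[col];
    sum1 += dxhat[i];
    sum2 += dxhat[i] * xhat[i];
  }

  // Block-wide tree reduction of both sums through shared memory.
  __shared__ float smem1[THREADS], smem2[THREADS];
  smem1[tid] = sum1;
  smem2[tid] = sum2;
  __syncthreads();
  for (int stride = THREADS / 2; stride > 0; stride >>= 1) {
    if (tid < stride) {
      smem1[tid] += smem1[tid + stride];
      smem2[tid] += smem2[tid + stride];
    }
    __syncthreads();
  }
  const float mean_dxhat = smem1[0] / COLS;
  const float mean_dxhat_xhat = smem2[0] / COLS;

  // dx = rsigma * (dxhat - mean(dxhat) - xhat * mean(dxhat * xhat))
  for (int i = 0; i < kPerThread; ++i) {
    const int col = tid + i * THREADS;
    dx[row * COLS + col] = rs * (dxhat[i] - mean_dxhat - xhat[i] * mean_dxhat_xhat);
  }
}

A launch over rows of 1024 columns would then be ln_bwd_dx_sketch<1024, 256><<<rows, 256>>>(x, dy, gamma, mean, rsigma, dx). Specializing on the column count lets the per-thread register footprint and loop trip counts be fixed at compile time, which is one reason the cols == 1024 case can be made faster than the generic path.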
29 changes: 24 additions & 5 deletions paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -284,11 +284,30 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
P* d_layernorm_bias, T* d_dropout_src,
T* d_bias, T* d_residual) {
using U = LayerNormParamType<T>;
LayerNormBackward<T, U, is_same_type>(
layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale,
d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx);
this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src,
d_residual, d_bias);
bool can_call_1024_kernel = false;
// Fast path for the case where cols is 1024 and linear_bias is nullptr.
// A non-null linear_bias would also be feasible for this implementation,
// but it is not supported here.
if (this->cols_ == 1024 && d_bias == nullptr && d_scale != nullptr &&
d_layernorm_bias != nullptr && sizeof(T) <= 4) {
can_call_1024_kernel = true;
}
VLOG(6) << "LaunchLayernormResidualDropoutGrad = " << can_call_1024_kernel;

if (can_call_1024_kernel) {
LaunchLayernormResidualDropoutGrad<T, U, MaskType, is_same_type>(
ctx, this->rows_, this->cols_, epsilon_,
this->dropout_param_.dropout_prob,
this->dropout_param_.is_upscale_in_train, d_out, layernorm_src, gamma,
mean, variance, mask, d_scale, d_layernorm_bias, d_residual,
d_dropout_src);
} else {
LayerNormBackward<T, U, is_same_type>(
layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale,
d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx);
this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src,
d_residual, d_bias);
}
}

protected:
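Restating the fast-path condition from fused_dropout_helper.h above as a standalone predicate, for readability only (the helper below is hypothetical and not part of the diff; the check lives inline in LayernormResidualDropoutBiasGrad):

template <typename T, typename P>
bool CanUse1024Kernel(int cols, const T *d_bias, const P *d_scale,
                      const P *d_layernorm_bias) {
  // The specialized backward is used only when: every row is exactly 1024
  // elements wide, no linear-bias gradient is requested, both layer-norm
  // parameter gradients are requested, and T is at most 4 bytes wide
  // (fp16/fp32, not double).
  return cols == 1024 && d_bias == nullptr && d_scale != nullptr &&
         d_layernorm_bias != nullptr && sizeof(T) <= 4;
}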
@@ -441,5 +441,30 @@ void LaunchLayernormResidualDropoutBias(
}
}

template <typename T, typename U, typename MaskType,
bool ScaleBiasWithSameTypeX = false>
void LaunchLayernormResidualDropoutGrad(
const platform::CUDADeviceContext &dev_ctx, const uint32_t rows,
const uint32_t cols, const float epsilon, const float dropout_prob,
const bool is_upscale_in_train, const T *d_out, const T *layernorm_src,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
const LayerNormParamType<T> *mean, const LayerNormParamType<T> *var,
const MaskType *mask_data,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_scale,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_layernorm_bias,
T *d_residual, T *d_dropout_src) {
const T zero = static_cast<T>(0.0f);
auto factor = dropout_prob == static_cast<float>(1.0f)
? zero
: static_cast<T>(1.0f / (1.0f - dropout_prob));
if (!is_upscale_in_train) {
factor = static_cast<T>(1.0f);
}
ln_bwd_1024_kernel_driver<
T, U, LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, MaskType>(
dev_ctx, rows, cols, epsilon, layernorm_src, scale, mean, var, d_out,
d_residual, d_scale, d_layernorm_bias, mask_data, factor, d_dropout_src);
}

} // namespace operators
} // namespace paddle
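The factor computed in LaunchLayernormResidualDropoutGrad above is the dropout rescale applied on the backward pass: in upscale_in_train mode the incoming gradient is multiplied by mask * 1/(1 - dropout_prob), otherwise by the mask alone, and dropout_prob == 1 degenerates to a zero factor. A minimal host-side reference of that assumed semantics (hypothetical code, not part of the diff):

#include <cstdint>
#include <vector>

// Reference gradient of dropout(x) given the saved mask, mirroring the factor
// logic above (names are illustrative).
std::vector<float> DropoutGradRef(const std::vector<float> &d_y,
                                  const std::vector<uint8_t> &mask,
                                  float dropout_prob, bool is_upscale_in_train) {
  float factor = dropout_prob == 1.0f ? 0.0f : 1.0f / (1.0f - dropout_prob);
  if (!is_upscale_in_train) factor = 1.0f;
  std::vector<float> d_x(d_y.size());
  for (size_t i = 0; i < d_y.size(); ++i) {
    d_x[i] = d_y[i] * static_cast<float>(mask[i]) * factor;
  }
  return d_x;
}

In the fused path this rescale is folded into the single ln_bwd_1024_kernel_driver call above, whose one pass produces d_residual, d_scale, d_layernorm_bias, and d_dropout_src together.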