[bf16] add bf16 kernel: dropout & reshape & slice #39395

Merged: 9 commits, Feb 10, 2022
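In summary, this PR registers bfloat16 kernels for the dropout, reshape, and slice operators on both CPU and CUDA, adds a bfloat16 instantiation of the Eigen pad functor, and extends the Python unit tests to cover the new dtype. Throughout the tests, bf16 tensors are represented as uint16 bit patterns via `convert_float_to_uint16` from `op_test`. As a rough sketch of that representation (assuming plain truncation; the real helper may round to nearest instead), the conversion keeps the upper 16 bits of the float32 encoding:

```python
import numpy as np

def float32_to_bf16_bits(x):
    # Keep the high 16 bits of the IEEE-754 float32 pattern (truncation);
    # the result is stored as uint16, which is how OpTest carries bf16 data.
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> np.uint32(16)).astype(np.uint16)

def bf16_bits_to_float32(x):
    # Widen back by placing the 16 bits in the upper half of a uint32;
    # the discarded mantissa bits come back as zeros.
    x = np.asarray(x, dtype=np.uint16)
    return (x.astype(np.uint32) << np.uint32(16)).view(np.float32)

a = np.random.random((4, 8)).astype(np.float32)
b = bf16_bits_to_float32(float32_to_bf16_bits(a))
assert np.allclose(a, b, atol=1e-2)  # bf16 keeps only ~8 mantissa bits
```

Paddle's own `convert_float_to_uint16` remains the authoritative helper; the sketch only illustrates the bit layout the new kernels operate on.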
paddle/fluid/operators/dropout_op.cc: 6 additions & 2 deletions
@@ -179,8 +179,12 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
dropout_grad,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
paddle/fluid/operators/dropout_op.cu: 3 additions & 0 deletions
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
@@ -84,8 +85,10 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::bfloat16>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
dropout_grad, ops::GPUDropoutGradKernel<plat::CUDADeviceContext, float>,
ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
ops::GPUDropoutGradKernel<plat::CUDADeviceContext, double>);
paddle/fluid/operators/reshape_op.cc: 7 additions & 4 deletions
@@ -698,27 +698,30 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
uint8_t, ops::ReshapeKernel, int64_t,
ops::ReshapeKernel, plat::float16,
ops::ReshapeKernel, plat::bfloat16,
ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel, uint8_t,
ops::ReshapeGradKernel, plat::float16,

ops::ReshapeGradKernel, plat::bfloat16,
ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
uint8_t, ops::ReshapeKernel, int64_t,
ops::ReshapeKernel, plat::float16,
ops::ReshapeKernel, bool, ops::ReshapeKernel,
plat::complex<float>, ops::ReshapeKernel,
plat::complex<double>, ops::ReshapeKernel);
plat::complex<double>, ops::ReshapeKernel,
plat::bfloat16, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(
reshape2_grad, float, ops::ReshapeGradKernel, double,
ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex<float>,
ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel);
ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel,
plat::bfloat16, ops::ReshapeGradKernel);

REGISTER_OP_CUDA_KERNEL_FUNCTOR(
reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
@@ -727,7 +730,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(
plat::float16, ops::ReshapeDoubleGradKernel, bool,
ops::ReshapeDoubleGradKernel, plat::complex<float>,
ops::ReshapeDoubleGradKernel, plat::complex<double>,
ops::ReshapeDoubleGradKernel);
ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
#endif

#ifdef PADDLE_WITH_XPU
paddle/fluid/operators/slice_op.cc: 10 additions & 2 deletions
@@ -442,7 +442,9 @@ REGISTER_OP_CPU_KERNEL(
ops::SliceKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::SliceKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
paddle::platform::complex<double>>,
ops::SliceKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);

REGISTER_OP_CPU_KERNEL(
slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, bool>,
@@ -453,7 +455,9 @@ REGISTER_OP_CPU_KERNEL(
ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
paddle::platform::complex<double>>,
ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);

REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, bool>,
@@ -463,6 +467,8 @@ REGISTER_OP_CUDA_KERNEL(
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::SliceKernel<paddle::platform::CUDADeviceContext,
paddle::platform::bfloat16>,
ops::SliceKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::SliceKernel<paddle::platform::CUDADeviceContext,
@@ -476,6 +482,8 @@ REGISTER_OP_CUDA_KERNEL(
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::bfloat16>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle/pten/kernels/funcs/eigen/pad.cc: 2 additions & 0 deletions
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/kernels/funcs/eigen/eigen_function.h"

@@ -61,6 +62,7 @@ INSTANTIATION(EigenPad, int);
INSTANTIATION(EigenPad, int64_t);
INSTANTIATION(EigenPad, float);
INSTANTIATION(EigenPad, double);
INSTANTIATION(EigenPad, dtype::bfloat16);
INSTANTIATION(EigenPad, dtype::complex<float>);
INSTANTIATION(EigenPad, dtype::complex<double>);
#undef INSTANTIATION
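The bfloat16 instantiation of `EigenPad` appears to be what the new slice kernels need on the backward pass: the slice gradient pads the upstream gradient with zeros back to the input shape, so the pad functor has to exist for every dtype slice is registered for. A rough numpy picture of that step (an illustration of the idea, not the Eigen functor itself):

```python
import numpy as np

# Gradient of y = x[-3:3, 0:100, :, 2:-1] for an input of shape (3, 4, 5, 6):
# only the last axis is narrowed (indices 2..4), so the backward pass pads
# d_out with 2 zeros before and 1 zero after along that axis.
d_out = np.ones((3, 4, 5, 3), dtype=np.float32)
pad_width = [(0, 0), (0, 0), (0, 0), (2, 1)]
d_x = np.pad(d_out, pad_width)  # constant (zero) padding by default
assert d_x.shape == (3, 4, 5, 6)
```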
python/paddle/fluid/tests/unittests/test_dropout_op.py: 22 additions & 1 deletion
@@ -17,7 +17,7 @@
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
import paddle
import paddle.static as static
import paddle.fluid as fluid
@@ -233,6 +233,27 @@ def init_test_case(self):
self.fix_seed = False


class TestBF16DropoutOp(OpTest):
def setUp(self):
self.op_type = "dropout"
self.dtype = np.uint16

x = np.random.random((32, 64)).astype("float32")
self.inputs = {'X': convert_float_to_uint16(x)}
self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
self.outputs = {
'Out':
convert_float_to_uint16(np.zeros((32, 64)).astype('float32')),
'Mask': np.zeros((32, 64)).astype('uint8')
}

def test_check_output(self):
self.check_output()

def test_check_grad_normal(self):
self.check_grad(['X'], 'Out')


class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
def test_seed_cpu_place(self):
paddle.enable_static()
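In `TestBF16DropoutOp` above, `dropout_prob` is 1.0 and `is_test` is False, so every element is dropped: the kernel writes zeros to both `Out` and `Mask`, which makes the expected tensors exact even at bf16 precision. A small sketch of the reference the test encodes (assuming the default `downgrade_in_infer` implementation, where the training-time output is simply `x * mask`):

```python
import numpy as np

x = np.random.random((32, 64)).astype(np.float32)
mask = np.zeros_like(x, dtype=np.uint8)   # dropout_prob == 1.0 drops everything
out = x * mask                            # training-time output: all zeros
np.testing.assert_array_equal(out, np.zeros((32, 64), dtype=np.float32))
# The gradient w.r.t. x is dOut * mask, i.e. also all zeros here.
```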
python/paddle/fluid/tests/unittests/test_reshape_op.py: 29 additions & 1 deletion
@@ -17,11 +17,12 @@
import unittest
import numpy as np

from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler
from paddle.static import Program, program_guard
import paddle.fluid.core as core


# situation 1: have shape( list, no tensor), no actual shape(Tensor)
@@ -48,6 +49,33 @@ def test_check_grad(self):
self.check_grad(["X"], "Out")


class TestReshapeBF16Op(OpTest):
def setUp(self):
self.init_data()
self.op_type = "reshape2"
self.dtype = np.uint16
x = np.random.random(self.ori_shape).astype("float32")
out = x.reshape(self.infered_shape)
self.inputs = {"X": convert_float_to_uint16(x)}
self.attrs = {"shape": self.new_shape}
self.outputs = {
"Out": convert_float_to_uint16(out),
'XShape': convert_float_to_uint16(
np.random.random(self.ori_shape).astype("float32"))
}

def init_data(self):
self.ori_shape = (2, 60)
self.new_shape = (12, 10)
self.infered_shape = (12, 10)

def test_check_output(self):
self.check_output(no_check_set=['XShape'])

def test_check_grad(self):
self.check_grad(["X"], "Out")


class TestReshapeOpDimInfer1(TestReshapeOp):
def init_data(self):
self.ori_shape = (5, 25)
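`TestReshapeBF16Op` targets `reshape2`, which returns an auxiliary `XShape` output that only records the input's shape for the backward pass; its values are never read, which is why the test passes `no_check_set=['XShape']` to `check_output`. The derived `DimInfer` cases (truncated above) exercise target shapes containing `-1`, which the op infers from the remaining extent, matching numpy's reshape semantics; a tiny illustration with hypothetical values:

```python
import numpy as np

x = np.random.random((2, 60)).astype(np.float32)
out = x.reshape((12, -1))   # the -1 entry is inferred: 2 * 60 // 12 == 10
assert out.shape == (12, 10)
```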
python/paddle/fluid/tests/unittests/test_slice_op.py: 30 additions & 1 deletion
@@ -17,7 +17,7 @@
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
@@ -484,6 +489,35 @@ def test_check_grad_normal(self):
numeric_grad_delta=0.5)


class TestBF16(OpTest):
def setUp(self):
self.op_type = "slice"
self.config()
self.inputs = {'Input': convert_float_to_uint16(self.input)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
self.attrs = {
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
}

def config(self):
self.dtype = np.uint16
self.input = np.random.random([3, 4, 5, 6]).astype(np.float32)
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
self.out = self.input[-3:3, 0:100, :, 2:-1]
self.infer_flags = [1, 1, 1]

def test_check_output(self):
self.check_output()

def test_check_grad_normal(self):
self.check_grad(['Input'], 'Out')


# Test python API
class TestSliceAPI(unittest.TestCase):
def test_1(self):
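In the slice `TestBF16` case, `starts` and `ends` apply only along the listed `axes`, negative bounds count from the end, and out-of-range bounds clamp to the dimension size, just as in Python slicing. A small reference sketch of that mapping (a hypothetical helper mirroring how `self.out` is computed; not part of the test suite):

```python
import numpy as np

def slice_reference(x, axes, starts, ends):
    # Start from a full slice on every axis, then narrow only the listed
    # axes; Python slice semantics handle clamping and negative indices.
    idx = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        idx[axis] = slice(start, end)
    return x[tuple(idx)]

x = np.random.random([3, 4, 5, 6]).astype(np.float32)
out = slice_reference(x, axes=[0, 1, 3], starts=[-3, 0, 2], ends=[3, 100, -1])
assert np.array_equal(out, x[-3:3, 0:100, :, 2:-1])
```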