diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 8bab7e44dca4fa..1a235f1293f382 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -681,14 +681,6 @@ void prepare_buffer_fusing::run(program& p) {
             if (gather_prim) {
                 update_dep(gather_prim);
             }
-
-            // Fallback to ocl impl since oneDNN doesn't support dynamic paddings
-            for (auto user : node.get_users()) {
-                if (user->get_preferred_impl_type() == impl_types::onednn) {
-                    GPU_DEBUG_TRACE_DETAIL << user->id() << ": change impl to ocl because of dynamic input paddings\n";
-                    user->set_preferred_impl_type(impl_types::ocl);
-                }
-            }
         }
     });
     program_helpers::do_for_types<read_value_node>(*node, [](read_value_node& node) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
index cdc78316b03d47..10c1a970d1793b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
@@ -64,6 +64,8 @@ struct gemm_onednn : typed_primitive_onednn_impl<gemm> {
                                       dnnl::memory::data_type& out_dt,
                                       dnnl::memory::dims& in0_dims,
                                       dnnl::memory::dims& in1_dims,
+                                      dnnl::memory::dims& in0_strides,
+                                      dnnl::memory::dims& in1_strides,
                                       dnnl::memory::dims& out_dims,
                                       dnnl::memory::format_tag& in0_fmt,
                                       dnnl::memory::format_tag& in1_fmt,
@@ -111,6 +113,22 @@ struct gemm_onednn : typed_primitive_onednn_impl<gemm> {
         in1_fmt = onednn::convert_gemm_data_format(in1_dims, in1_l.format);
         out_fmt = onednn::convert_gemm_data_format(out_dims, out_l.format);
 
+        if (in0_l.data_padding) {
+            dnnl::memory::dims in0_padded_dims = onednn::convert_gemm_tensor(in0_l.get_buffer_size(), rank, batched_dims_can_be_removed);
+            if (prim->transpose_input0) {
+                std::swap(in0_padded_dims[in0_padded_dims.size() - 1], in0_padded_dims[in0_padded_dims.size() - 2]);
+            }
+            in0_strides = onednn::get_strides(in0_padded_dims);
+        }
+
+        if (in1_l.data_padding) {
+            dnnl::memory::dims in1_padded_dims = onednn::convert_gemm_tensor(in1_l.get_buffer_size(), rank, batched_dims_can_be_removed);
+            if (prim->transpose_input1) {
+                std::swap(in1_padded_dims[in1_padded_dims.size() - 1], in1_padded_dims[in1_padded_dims.size() - 2]);
+            }
+            in1_strides = onednn::get_strides(in1_padded_dims);
+        }
+
         if (prim->transpose_input0) {
             in0_fmt = transpose_format(in0_fmt);
             std::swap(in0_dims[in0_dims.size() - 1], in0_dims[in0_dims.size() - 2]);
@@ -130,6 +148,19 @@ struct gemm_onednn : typed_primitive_onednn_impl<gemm> {
         }
     }
 
+    static dnnl::memory::desc get_input_memory_desc(const dnnl::memory::dims& dims,
+                                                    dnnl::memory::data_type dt,
+                                                    dnnl::memory::format_tag fmt,
+                                                    const dnnl::memory::dims& strides) {
+        dnnl::memory::desc res;
+        if (strides.empty()) {
+            res = dnnl::memory::desc(dims, dt, fmt);
+        } else {
+            res = dnnl::memory::desc(dims, dt, strides);
+        }
+        return res;
+    }
+
     static std::shared_ptr<dnnl::matmul::primitive_desc> get_gemm_primitive_descriptor(const kernel_impl_params& impl_params,
                                                                                        const dnnl::primitive_attr& attr = dnnl::primitive_attr()) {
         auto& engine = impl_params.prog->get_engine();
@@ -146,16 +177,19 @@ struct gemm_onednn : typed_primitive_onednn_impl<gemm> {
         dnnl::memory::dims out_dims;
         dnnl::memory::dims bias_dims;
 
+        dnnl::memory::dims in0_strides;
+        dnnl::memory::dims in1_strides;
+
         dnnl::memory::format_tag in0_fmt;
         dnnl::memory::format_tag in1_fmt;
         dnnl::memory::format_tag out_fmt;
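        // NOTE (illustration only, not part of the change): with dynamic padding the buffer backing an
        // input is larger than its logical shape, so a plain format_tag can no longer describe the
        // memory layout and oneDNN needs explicit strides. A minimal sketch, assuming a logical
        // [B, M, K] input whose innermost axis is padded to K_pad in the backing buffer:
        //
        //     dnnl::memory::dims dims    = {B, M, K};             // logical sizes given to matmul
        //     dnnl::memory::dims strides = {M * K_pad, K_pad, 1}; // derived from the padded buffer
        //     dnnl::memory::desc in0_md(dims, dnnl::memory::data_type::f16, strides);
        //
        // This is the case get_input_memory_desc() above selects when the strides vector is non-empty;
        // without padding the strides stay empty and the format_tag constructor is used instead.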
dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { @@ -199,13 +233,16 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt; dnnl::memory::format_tag in1_fmt; dnnl::memory::format_tag out_fmt; dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); ob << make_data(&in0_dt, sizeof(dnnl::memory::data_type)); ob << make_data(&in1_dt, sizeof(dnnl::memory::data_type)); @@ -215,6 +252,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ob << in1_dims; ob << out_dims; + ob << in0_strides; + ob << in1_strides; + ob << make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -248,6 +288,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag in1_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag out_fmt = dnnl::memory::format_tag::undef; @@ -261,6 +304,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> in1_dims; ib >> out_dims; + ib >> in0_strides; + ib >> in1_strides; + ib >> make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -271,8 +317,8 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> make_data(&bias_fmt, sizeof(dnnl::memory::format_tag)); } - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index f77b4469b1f619..6214a8db4d8255 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -94,6 +94,12 @@ dnnl::memory::dims flatten_tensor(cldnn::tensor t) { return 
{static_cast<int64_t>(t.count())};
 }
 
+dnnl::memory::dims get_strides(dnnl::memory::dims dims) {
+    dnnl::memory::dims strides(dims.size(), dnnl::memory::dim(1));
+    std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1, std::multiplies<dnnl::memory::dim>());
+    return strides;
+}
+
 dnnl::memory::data_type convert_data_type(cldnn::data_types dt) {
     switch (dt) {
         case cldnn::data_types::f32: return dnnl::memory::data_type::f32;
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp
index a789107e2cf2bb..e8127b698f57d5 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp
@@ -28,6 +28,7 @@ dnnl::memory::dims convert_tensor(cldnn::tensor t, size_t dims = 2, bool is_grou
 dnnl::memory::dims convert_gemm_tensor(cldnn::tensor t, size_t dims, bool batched_dims_can_be_removed);
 dnnl::memory::dims convert_spatials(cldnn::tensor t, size_t dims = 2);
 dnnl::memory::dims flatten_tensor(cldnn::tensor t);
+dnnl::memory::dims get_strides(dnnl::memory::dims dims);
 dnnl::memory::data_type convert_data_type(cldnn::data_types dt);
 dnnl::memory::format_tag convert_data_format(cldnn::format fmt);
 cldnn::format convert_data_format(dnnl::memory::format_tag fmt);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
index 8ce9e294a867fe..180d8cdb036483 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -2527,150 +2527,302 @@ INSTANTIATE_TEST_SUITE_P(gemm_gpu, gemm_onednn_ndims, ::testing::ValuesIn(std::v
     gemm_onednn_test_params{ CASE_GEMM_ONEDNN_I8_6D },
 }));
 
-TEST(gemm_onednn, impl_replacement_with_cldnn) {
-    auto& engine = get_test_engine();
+class gemm_onednn: public ::testing::Test {
+public:
+    void test_impl_replacement_with_cldnn() {
+        auto& engine = get_test_engine();
 
-    if (!engine.get_device_info().supports_immad)
-        return;
+        if (!engine.get_device_info().supports_immad)
+            return;
+
+        ov::Shape in1_shape = { 1, 1, 3, 4 };
+        ov::Shape in2_shape = { 1, 4 };
+        auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx};
+        auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx};
+        auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx});
+        auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});
+
+        std::vector<float> input1_data = {
+            1.f, -2.f, 3.f, -4.f,
+            5.f, 6.f, 1.f, 2.f,
+            3.f, 3.f, 2.f, -1.f,
+        };
+
+        std::vector<float> input2_data = {
+            2.f, 5.f, -4.f, -7.f,
+        };
+        set_values(input1, input1_data);
+        set_values(input2, input2_data);
+
+        std::vector<float> out_data = {
+            8.f, 22.f, 20.f
+        };
+
+        topology topology;
+        topology.add(input_layout("input1", in1_layout),
+                     input_layout("input2", in2_layout),
+                     gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2)
+        );
+
+        ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn };
+        ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order),
+                             ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }),
+                             ov::intel_gpu::optimize_data(true),
+                             ov::intel_gpu::allow_new_shape_infer(true) };
+
+        network network(engine, topology, cfg);
+        network.set_input_data("input1", input1);
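        // NOTE (illustration only, not part of the change): get_strides() added to onednn/utils.cpp
        // above builds dense row-major strides with a right-to-left running product, keeping the
        // innermost stride at 1. A small standalone sketch of the same computation:
        //
        //     #include <functional>
        //     #include <numeric>
        //     #include <vector>
        //     #include <cstdint>
        //
        //     std::vector<int64_t> get_strides(std::vector<int64_t> dims) {
        //         std::vector<int64_t> strides(dims.size(), 1);
        //         std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1,
        //                          std::multiplies<int64_t>());
        //         return strides;
        //     }
        //
        //     // get_strides({2, 3, 4}) == {12, 4, 1}: element (b, m, k) sits at offset b*12 + m*4 + k.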
network.set_input_data("input2", input2); + + auto inst = network.get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + + auto output = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + ASSERT_EQ(output_ptr.size(), (uint32_t)3); + for (uint32_t i = 0; i < out_data.size(); ++i) { + ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + } + + // WA: Call wait_all() to wait for all queued kernels compilation finish + network.get_program()->get_compilation_context().wait_all(); - ov::Shape in1_shape = { 1, 1, 3, 4 }; - ov::Shape in2_shape = { 1, 4 }; - auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx}; - auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx}; - auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx}); - auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx}); - - std::vector input1_data = { - 1.f, -2.f, 3.f, -4.f, - 5.f, 6.f, 1.f, 2.f, - 3.f, 3.f, 2.f, -1.f, - }; - - std::vector input2_data = { - 2.f, 5.f, -4.f, -7.f, - }; - set_values(input1, input1_data); - set_values(input2, input2_data); - - std::vector out_data = { - 8.f, 22.f, 20.f - }; - - topology topology; - topology.add(input_layout("input1", in1_layout), - input_layout("input2", in2_layout), - gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2) - ); - - ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(true) }; - - network network(engine, topology, cfg); - network.set_input_data("input1", input1); - network.set_input_data("input2", input2); - - auto inst = network.get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic()); - - auto outputs = network.execute(); - - auto output = outputs.at("gemm").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); - - ASSERT_EQ(output_ptr.size(), (uint32_t)3); - for (uint32_t i = 0; i < out_data.size(); ++i) { - ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + // Check if OneDNN's impl is used for the next execute() call + network.execute(); + inst = network.get_primitive("gemm"); + impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_FALSE(impl->is_dynamic()); } - // WA: Call wait_all() to wait for all queued kernels compilation finish - network.get_program()->get_compilation_context().wait_all(); + void test_check_transpose_format(const std::vector& permute_order) { + auto& engine = get_test_engine(); + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - // Check if OneDNN's impl is used for the next execute() call - network.execute(); - inst = network.get_primitive("gemm"); - impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_FALSE(impl->is_dynamic()); -} + if (!engine.get_device_info().supports_immad) + return; -// Check gemm_onednn transpose_format() can accept transpose white list format (byfx/bxfy) -TEST(gemm_onednn, check_transpose_format_byfx) { - auto& engine = get_test_engine(); - 
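// NOTE (illustration only, not part of the change): test_impl_replacement_with_cldnn() above exercises
// the dynamic-shape flow: the first execute() runs a shape-agnostic implementation while the statically
// shaped oneDNN kernel is compiled asynchronously, and the next execute() should pick it up. Condensed,
// the assertion pattern is (identifiers as used in the test, no new API):
//
//     auto inst = network.get_primitive("gemm");
//     ASSERT_TRUE(inst->get_impl()->is_dynamic());                 // first run: dynamic fallback impl
//     network.get_program()->get_compilation_context().wait_all(); // wait for async compilation
//     network.execute();
//     ASSERT_FALSE(network.get_primitive("gemm")->get_impl()->is_dynamic());  // static oneDNN impl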
tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - if (!engine.get_device_info().supports_immad) - return; + topology topology; + topology.add(input_layout("input0", input0->get_layout())); + topology.add(permute("permute0", input_info("input0"), permute_order)); + topology.add(input_layout("input1", input1->get_layout())); + topology.add(permute("permute1", input_info("input1"), permute_order)); + topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + + ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; + ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(false) }; + network network(engine, topology, config); + + auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); + auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + + set_values(input0, input0_data); + set_values(input1, input1_data); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + network.set_input_data("input0", input0); + network.set_input_data("input1", input1); - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 2, 1, 3})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 2, 1, 3})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + ASSERT_NO_FATAL_FAILURE(network.execute()); + } - ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(false) }; - network network(engine, topology, config); + void test_dynamic_padding(bool n_dim_only) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); - auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + auto& engine = get_test_engine(); - set_values(input0, input0_data); - set_values(input1, input1_data); + if (!engine.get_device_info().supports_immad) + return; - network.set_input_data("input0", input0); - network.set_input_data("input1", input1); + const unsigned long BATCH_SIZE = 31; + const unsigned long M_SIZE = 11; + const unsigned long K_SIZE = 37; + const unsigned long N_SIZE = 49; - ASSERT_NO_FATAL_FAILURE(network.execute()); -} + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x 
= 0; x < l.spatial(0); ++x) { + auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + }; -TEST(gemm_onednn, check_transpose_format_bxfy) { - auto& engine = get_test_engine(); - tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + const auto align_size_m = 13; + const auto align_size_k = 16; + const auto align_size_n = 15; + const auto align_size_b1 = 3; + const auto align_size_b2 = 19; - if (!engine.get_device_info().supports_immad) - return; + const auto aligned_batch1_size = align_to(1ul, align_size_b1); + auto padding_size_batch1 = static_cast(aligned_batch1_size - 1); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2); + auto padding_size_batch2 = static_cast(aligned_batch2_size - BATCH_SIZE); + + const auto aligned_m_size = align_to(M_SIZE, align_size_m); + auto padding_size_m = static_cast(aligned_m_size - M_SIZE); + const auto aligned_k_size = align_to(K_SIZE, align_size_k); + auto padding_size_k = static_cast(aligned_k_size - K_SIZE); + const auto aligned_n_size = align_to(N_SIZE, align_size_n); + auto padding_size_n = static_cast(aligned_n_size - N_SIZE); + + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size }; + ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_k_size, aligned_n_size }; + + // Use dynamic padding for all BFYX dimensions + tensor dyn_pad_dims_input1({0, 0, 0, 0}, 0); + tensor dyn_pad_dims_input2({0, 0, 0, 0}, 0); + + if (n_dim_only) { + dyn_pad_dims_input1 = tensor({0, 0, 0, 0}, 0); + dyn_pad_dims_input2 = tensor({0, 0, 1, 0}, 0); + } else { + dyn_pad_dims_input1 = tensor({1, 1, 1, 1}, 0); + dyn_pad_dims_input2 = tensor({1, 1, 1, 1}, 0); + } + + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1)}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2)}; + + auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx}); + auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx}); + + auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape), + data_types::f16, + format::bfyx, + n_dim_only ? padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_k, padding_size_m}, 0.0f, dyn_pad_dims_input1)}); + + auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape), + data_types::f16, + format::bfyx, + n_dim_only ? 
padding({0, 0, 0, 0}, {0, 0, padding_size_n, 0}, 0.0f, dyn_pad_dims_input2) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_n, padding_size_k}, 0.0f, dyn_pad_dims_input2)}); + + auto input_1_data = rg.generate_random_1d(ov::shape_size(in1_shape), -2, 2); + auto input_2_data = rg.generate_random_1d(ov::shape_size(in2_shape), -2, 2); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + auto get_ref_results = [&]() { + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx}); + auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx}); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm_ref"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "reorder"); + + auto inst = network.get_primitive("reorder"); + + auto output_mem = outputs.at("reorder").get_memory(); + auto output_layout = outputs.at("reorder").get_layout(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 3, 1, 2})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 3, 1, 2})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); 
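        // NOTE (illustration only, not part of the change): the padded inputs above are fabricated by
        // allocating the aligned (larger) buffer and then reinterpreting it with the logical shape plus
        // an upper padding that is marked dynamic, so the extra elements exist only in the layout:
        //
        //     // physical buffer holds the aligned shape, e.g. [1, B_pad, M_pad, K_pad]
        //     auto aligned_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx});
        //     // logical view is [1, B, M, K]; the size difference is recorded as dynamic upper padding
        //     auto padded_view = engine.reinterpret_buffer(*aligned_mem, padded_layout /* built as in the test above */);
        //
        // When such a memory reaches the oneDNN GEMM, in0_strides/in1_strides become non-empty and the
        // stride-based dnnl::memory::desc path added in gemm_onednn.cpp is taken.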
-    ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn };
-    ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order),
-                            ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }),
-                            ov::intel_gpu::optimize_data(true),
-                            ov::intel_gpu::allow_new_shape_infer(false) };
-    network network(engine, topology, config);
+        auto outputs = network.execute();
 
-    auto input0_data = rg.generate_random_1d<ov::float16>(input0->get_layout().count(), -1, 1);
-    auto input1_data = rg.generate_random_1d<ov::float16>(input1->get_layout().count(), -1, 1);
+        auto output_mem = outputs.at("reorder").get_memory();
+        auto output_layout = outputs.at("reorder").get_layout();
 
-    set_values(input0, input0_data);
-    set_values(input1, input1_data);
+        auto res = engine.reinterpret_buffer(*output_mem, output_layout);
 
-    network.set_input_data("input0", input0);
-    network.set_input_data("input1", input1);
+        auto ref_res = get_ref_results();
+
+        mem_lock<float> res_lock(res, get_test_stream());
+        mem_lock<float> res_ref_lock(ref_res, get_test_stream());
+        for (size_t i = 0; i < res->count(); i++) {
+            ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i;
+        }
+    }
+};
+
+TEST_F(gemm_onednn, impl_replacement_with_cldnn) {
+    this->test_impl_replacement_with_cldnn();
 }
+
+// Check gemm_onednn transpose_format() can accept transpose white list format (byfx/bxfy)
+TEST_F(gemm_onednn, check_transpose_format_byfx) {
+    this->test_check_transpose_format({0, 2, 1, 3});
+}
+
+TEST_F(gemm_onednn, check_transpose_format_bxfy) {
+    this->test_check_transpose_format({0, 3, 1, 2});
+}
+
+TEST_F(gemm_onednn, dynamic_padding_all_dim) {
+    this->test_dynamic_padding(false);
+}
 
-    ASSERT_NO_FATAL_FAILURE(network.execute());
+TEST_F(gemm_onednn, dynamic_padding_n_dim_only) {
+    this->test_dynamic_padding(true);
 }
 
 template