From 48132ecf4108c8b070c865a87111f5b3b2271def Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki
Date: Mon, 20 Sep 2021 14:49:21 +0200
Subject: [PATCH] Fix bugs in C API and refactor tests (#3350)

Fix `daliDeserializeDefault`: the batch size map was never allocated, so
when the pipeline handle was later cleaned up correctly, it tried to
deallocate memory through an invalid pointer.

Add the missing `daliDeletePipeline` call to all C API tests so the tests
no longer leak the created pipeline.

Some tests (daliOutputCopySamples) used to fill the prefetch queue and
schedule additional runs while accessing only one output iteration. The
never-freed pipeline could still be actively using that memory when the
memory resources were released on shutdown, and the whole test process
could crash. Remove the additional unused iterations in such cases.

Rework the tests so they no longer access the underlying contiguous
buffer of TensorList.

Signed-off-by: Krzysztof Lecki
---
 dali/c_api/c_api.cc      |  20 +++--
 dali/c_api/c_api_test.cc | 183 ++++++++++++++++++++++++---------------
 include/dali/c_api.h     |   2 +-
 3 files changed, 126 insertions(+), 79 deletions(-)
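For reviewers, a minimal sketch of the C API lifecycle these fixes concern;
it is not part of the patch. `serialized` stands for any protobuf-serialized
pipeline with a uint8 HWC external source named "INPUT" (the name, batch
size and the single 2x2x3 sample are illustrative assumptions):

```cpp
#include <cstdint>
#include <string>

#include "dali/c_api.h"

void RunOnce(const std::string &serialized) {
  daliPipelineHandle handle;
  // Before this fix, daliDeserializeDefault never allocated
  // handle.batch_size_map, so the `delete bs_map` in daliDeletePipeline
  // ran on an uninitialized pointer.
  daliDeserializeDefault(&handle, serialized.c_str(), serialized.size());

  uint8_t data[2 * 2 * 3] = {};  // one 2x2x3 HWC sample (illustrative)
  int64_t shape[] = {2, 2, 3};
  daliSetExternalInputBatchSize(&handle, "INPUT", 1);
  daliSetExternalInput(&handle, "INPUT", device_type_t::CPU, data,
                       dali_data_type_t::DALI_UINT8, shape, 3, "HWC",
                       DALI_ext_default);
  daliRun(&handle);
  daliOutput(&handle);

  // The tests used to omit this call; a leaked, still-scheduled pipeline
  // could touch already-released memory resources during shutdown.
  daliDeletePipeline(&handle);
}
```

The reworked tests follow the same pattern: every handle created with
daliCreatePipeline or daliDeserializeDefault is now paired with a
daliDeletePipeline call.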
diff --git a/dali/c_api/c_api.cc b/dali/c_api/c_api.cc
index 2796b6a908f..357a93fa585 100644
--- a/dali/c_api/c_api.cc
+++ b/dali/c_api/c_api.cc
@@ -41,7 +41,7 @@ bool dali_initialized = false;
  * Typically, this operator will be BatchSizeProvider.
  * Negative values denote max batch size (default state).
  * Typical usage:
- *   auto batch_sizes_map = reinterpret_cast<batch_size_map_t *>(handle->batch_sizes_map);
+ *   auto *batch_size_map = reinterpret_cast<batch_size_map_t *>(handle->batch_size_map);
  */
 using batch_size_map_t = std::unordered_map<std::string, int>;
@@ -80,7 +80,7 @@ void SetExternalInput(daliPipelineHandle *pipe_handle, const char *name, const void *data_ptr,
                       dali_data_type_t data_type, const int64_t *shapes, int sample_dim,
                       const char *layout_str, cudaStream_t stream = 0, unsigned int flags = 0) {
   dali::Pipeline *pipeline = reinterpret_cast<dali::Pipeline *>(pipe_handle->pipe);
-  auto bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_sizes_map);
+  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_size_map);
   auto curr_batch_size = PopCurrBatchSize(bs_map, pipeline->max_batch_size(), name);
   std::vector<int64_t> shapes_tmp(shapes, shapes + sample_dim * curr_batch_size);
   dali::TensorListShape<> tl_shape(std::move(shapes_tmp), curr_batch_size, sample_dim);
@@ -111,7 +111,7 @@ void SetExternalInputTensors(daliPipelineHandle *pipe_handle, const char *name,
                              const int64_t *shapes, int64_t sample_dim, const char *layout_str,
                              cudaStream_t stream = 0, unsigned int flags = 0) {
   dali::Pipeline *pipeline = reinterpret_cast<dali::Pipeline *>(pipe_handle->pipe);
-  auto bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_sizes_map);
+  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_size_map);
   auto curr_batch_size = PopCurrBatchSize(bs_map, pipeline->max_batch_size(), name);
   std::vector<int64_t> shapes_tmp(shapes, shapes + sample_dim * curr_batch_size);
   dali::TensorListShape<> tl_shape(std::move(shapes_tmp), curr_batch_size, sample_dim);
@@ -177,12 +177,12 @@ void daliCreatePipeline(daliPipelineHandle *pipe_handle, const char *serialized_pipeline,
   if (pipeline->device_id() >= 0) {
     stream = dali::CUDAStream::Create(true);
   }
+  auto bs_map = std::make_unique<batch_size_map_t>();
+
   pipe_handle->ws = ws.release();
   pipe_handle->copy_stream = stream.release();
   pipe_handle->pipe = pipeline.release();
-
-  auto bs_map = std::make_unique<batch_size_map_t>();
-  pipe_handle->batch_sizes_map = bs_map.release();
+  pipe_handle->batch_size_map = bs_map.release();
 }
@@ -195,9 +195,11 @@ void daliDeserializeDefault(daliPipelineHandle *pipe_handle, const char *serialized_pipeline,
     stream = dali::CUDAStream::Create(true);
   }
   auto ws = std::make_unique<dali::DeviceWorkspace>();
+  auto bs_map = std::make_unique<batch_size_map_t>();
   pipe_handle->ws = ws.release();
   pipe_handle->copy_stream = stream.release();
   pipe_handle->pipe = pipeline.release();
+  pipe_handle->batch_size_map = bs_map.release();
 }
@@ -225,7 +227,7 @@ void daliPrefetchSeparate(daliPipelineHandle *pipe_handle,
 
 void daliSetExternalInputBatchSize(daliPipelineHandle *pipe_handle, const char *name,
                                    int batch_size) {
-  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_sizes_map);
+  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_size_map);
   (*bs_map)[name] = batch_size;
 }
@@ -541,7 +543,7 @@ void daliCopyTensorListNTo(daliPipelineHandle *pipe_handle, void *dst, int output_id,
 void daliDeletePipeline(daliPipelineHandle* pipe_handle) {
   dali::Pipeline *pipeline = reinterpret_cast<dali::Pipeline *>(pipe_handle->pipe);
   dali::DeviceWorkspace *ws = reinterpret_cast<dali::DeviceWorkspace *>(pipe_handle->ws);
-  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_sizes_map);
+  auto *bs_map = reinterpret_cast<batch_size_map_t *>(pipe_handle->batch_size_map);
   DALI_ENFORCE(pipeline != nullptr && ws != nullptr, "Pipeline already deleted");
   if (pipe_handle->copy_stream) {
     CUDA_CALL(cudaStreamDestroy(pipe_handle->copy_stream));
@@ -552,7 +554,7 @@ void daliDeletePipeline(daliPipelineHandle* pipe_handle) {
   delete bs_map;
   pipe_handle->ws = nullptr;
   pipe_handle->pipe = nullptr;
-  pipe_handle->batch_sizes_map = nullptr;
+  pipe_handle->batch_size_map = nullptr;
 }
 
 void daliLoadLibrary(const char* lib_path) {
diff --git a/dali/c_api/c_api_test.cc b/dali/c_api/c_api_test.cc
index 619d1a5bba6..644e7974750 100644
--- a/dali/c_api/c_api_test.cc
+++ b/dali/c_api/c_api_test.cc
@@ -19,6 +19,7 @@
 #include <string>
 
 #include "dali/c_api.h"
+#include "dali/pipeline/data/buffer.h"
 #include "dali/pipeline/data/tensor_list.h"
 #include "dali/pipeline/data/views.h"
 #include "dali/pipeline/pipeline.h"
@@ -135,7 +136,7 @@ std::unique_ptr<Pipeline> GetExternalSourcePipeline(bool no_copy, const std::string &device) {
 }
 
 
-// Takes Outptus from baseline and handle and compares them
+// Takes Outputs from baseline and handle and compares them
 // Allows only for uint8_t CPU/GPU output data to be compared
 template <typename Backend>
 void ComparePipelinesOutputs(daliPipelineHandle &handle, Pipeline &baseline,
@@ -163,14 +164,17 @@ void ComparePipelinesOutputs(daliPipelineHandle &handle, Pipeline &baseline,
   // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
   pipeline_output_cpu.Copy(ws.Output<Backend>(0), cuda_stream);
 
-  TensorList<Backend> c_api_output;
-  c_api_output.Resize(pipeline_output_cpu.shape(), DALI_UINT8);
-  daliOutputCopy(&handle, c_api_output.raw_mutable_data(), 0,
+  auto num_elems = pipeline_output_cpu.shape().num_elements();
+
+  auto backend_buf = AllocBuffer<Backend>(num_elems * sizeof(uint8_t), false);
+  auto cpu_buf = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+  daliOutputCopy(&handle, backend_buf.get(), 0,
                  backend_to_device_type<Backend>::value, 0, copy_output_flags);
+
   // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-  c_api_output_cpu.Copy(c_api_output, cuda_stream);
+  MemCopy(cpu_buf.get(), backend_buf.get(), num_elems * sizeof(uint8_t), cuda_stream);
   CUDA_CALL(cudaDeviceSynchronize());
-  Check(view<uint8_t>(pipeline_output_cpu), view<uint8_t>(c_api_output_cpu));
+  Check(view<uint8_t>(pipeline_output_cpu),
+        TensorListView<StorageCPU, uint8_t>(cpu_buf.get(), pipeline_output_cpu.shape()));
 }
@@ -216,6 +220,8 @@ TYPED_TEST(CApiTest, GetOutputNameTest) {
   ASSERT_EQ(daliGetNumOutput(&handle), 2);
   EXPECT_STREQ(daliGetOutputName(&handle, 0), output0_name.c_str());
   EXPECT_STREQ(daliGetOutputName(&handle, 1), output1_name.c_str());
+
+  daliDeletePipeline(&handle);
 }
@@ -245,6 +251,7 @@ TYPED_TEST(CApiTest, FileReaderPipe) {
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
 
 TYPED_TEST(CApiTest, FileReaderDefaultPipe) {
@@ -271,6 +278,7 @@ TYPED_TEST(CApiTest, FileReaderDefaultPipe) {
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
@@ -278,9 +286,12 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocPipe) {
   TensorListShape<> input_shape = {{37, 23, 3}, {12, 22, 3}, {42, 42, 3}, {8, 8, 3},
                                    {64, 32, 3}, {32, 64, 3}, {20, 20, 3}, {64, 64, 3},
                                    {10, 10, 3}, {60, 50, 3}, {10, 15, 3}, {48, 48, 3}};
-  TensorList<CPUBackend> input_cpu;
-  TensorList<TypeParam> input;
-  input_cpu.Resize(input_shape, DALI_UINT8);
+  auto num_elems = input_shape.num_elements();
+
+  auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+  auto input = AllocBuffer<TypeParam>(num_elems * sizeof(uint8_t), false);
+  TensorList<TypeParam> input_wrapper;
+
   auto pipe_ptr = GetTestPipeline(false, this->output_device_);
   auto serialized = pipe_ptr->SerializeToProtobuf();
@@ -292,13 +303,15 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocPipe) {
                     prefetch_queue_depth, false);
 
   for (int i = 0; i < prefetch_queue_depth; i++) {
-    SequentialFill(view<uint8_t>(input_cpu), 42 * i);
+    SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42 * i);
     // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-    input.Copy(input_cpu, cuda_stream);
-    pipe_ptr->SetExternalInput(input_name, input);
+    MemCopy(input.get(), input_cpu.get(), num_elems * sizeof(uint8_t), cuda_stream);
+    input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                            input_shape, DALI_UINT8);
+    pipe_ptr->SetExternalInput(input_name, input_wrapper);
     daliSetExternalInputBatchSize(&handle, input_name.c_str(), input_shape.num_samples());
     daliSetExternalInputAsync(&handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                              input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                              input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
                               input_shape.sample_dim(), nullptr, cuda_stream, DALI_ext_default);
   }
@@ -313,18 +326,22 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocPipe) {
     ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
   }
 
-  SequentialFill(view<uint8_t>(input_cpu), 42 * prefetch_queue_depth);
+  SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape),
+                 42 * prefetch_queue_depth);
   // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-  input.Copy(input_cpu, cuda_stream);
-  pipe_ptr->SetExternalInput(input_name, input);
+  MemCopy(input.get(), input_cpu.get(), num_elems * sizeof(uint8_t), cuda_stream);
+  input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                          input_shape, DALI_UINT8);
+  pipe_ptr->SetExternalInput(input_name, input_wrapper);
   daliSetExternalInputAsync(&handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                            input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                            input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
                             input_shape.sample_dim(), "HWC", cuda_stream, DALI_ext_default);
   daliRun(&handle);
   pipe_ptr->RunCPU();
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
@@ -351,18 +368,22 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocVariableBatchSizePipe) {
     pipe_ptr = GetTestPipeline(false, this->output_device_);
     pipe_ptr->Build();
 
-    TensorList<CPUBackend> input_cpu;
-    TensorList<TypeParam> input;
-    input_cpu.Resize(input_shape, DALI_UINT8);
+    auto num_elems = input_shape.num_elements();
+
+    auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+    auto input = AllocBuffer<TypeParam>(num_elems * sizeof(uint8_t), false);
+    TensorList<TypeParam> input_wrapper;
 
     for (int i = 0; i < prefetch_queue_depth; i++) {
-      SequentialFill(view<uint8_t>(input_cpu), 42 * i);
+      SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42 * i);
       // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-      input.Copy(input_cpu, cuda_stream);
-      pipe_ptr->SetExternalInput(input_name, input);
+      MemCopy(input.get(), input_cpu.get(), num_elems, cuda_stream);
+      input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                              input_shape, DALI_UINT8);
+      pipe_ptr->SetExternalInput(input_name, input_wrapper);
       daliSetExternalInputBatchSize(&handle, input_name.c_str(), input_shape.num_samples());
       daliSetExternalInputAsync(&handle, input_name.c_str(),
-                                backend_to_device_type<TypeParam>::value, input.raw_data(),
+                                backend_to_device_type<TypeParam>::value, input.get(),
                                 dali_data_type_t::DALI_UINT8, input_shape.data(),
                                 input_shape.sample_dim(), nullptr, cuda_stream, DALI_ext_default);
     }
@@ -379,6 +400,7 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocVariableBatchSizePipe) {
                                              input_shape.num_samples());
     }
   }
+  daliDeletePipeline(&handle);
 }
@@ -439,6 +461,7 @@ TYPED_TEST(CApiTest, ExternalSourceMultipleAllocPipe) {
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
@@ -451,9 +474,12 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocDifferentBackendsTest) {
   TensorListShape<> input_shape = {{37, 23, 3}, {12, 22, 3}, {42, 42, 3}, {8, 8, 3},
                                    {64, 32, 3}, {32, 64, 3}, {20, 20, 3}, {64, 64, 3},
                                    {10, 10, 3}, {60, 50, 3}, {10, 15, 3}, {48, 48, 3}};
-  TensorList<CPUBackend> input_cpu;
-  TensorList<TypeParam> input;
-  input_cpu.Resize(input_shape, DALI_UINT8);
+  auto num_elems = input_shape.num_elements();
+
+  auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+  auto input = AllocBuffer<TypeParam>(num_elems * sizeof(uint8_t), false);
+  TensorList<TypeParam> input_wrapper;
+
   auto pipe_ptr = GetTestPipeline(false, this->output_device_);
   auto serialized = pipe_ptr->SerializeToProtobuf();
@@ -465,13 +491,15 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocDifferentBackendsTest) {
                     prefetch_queue_depth, false);
 
   for (int i = 0; i < prefetch_queue_depth; i++) {
-    SequentialFill(view<uint8_t>(input_cpu), 42 * i);
+    SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42 * i);
    // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-    input.Copy(input_cpu, cuda_stream);
+    MemCopy(input.get(), input_cpu.get(), num_elems, cuda_stream);
     CUDA_CALL(cudaStreamSynchronize(cuda_stream));
-    pipe_ptr->SetExternalInput(input_name, input);
+    input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                            input_shape, DALI_UINT8);
+    pipe_ptr->SetExternalInput(input_name, input_wrapper);
     daliSetExternalInput(&handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                         input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                         input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
                          input_shape.sample_dim(), nullptr, DALI_ext_default);
   }
@@ -486,19 +514,23 @@ TYPED_TEST(CApiTest, ExternalSourceSingleAllocDifferentBackendsTest) {
     ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
   }
 
-  SequentialFill(view<uint8_t>(input_cpu), 42 * prefetch_queue_depth);
+  SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape),
+                 42 * prefetch_queue_depth);
   // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-  input.Copy(input_cpu, cuda_stream);
+  MemCopy(input.get(), input_cpu.get(), num_elems, cuda_stream);
   CUDA_CALL(cudaStreamSynchronize(cuda_stream));
-  pipe_ptr->SetExternalInput(input_name, input);
+  input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                          input_shape, DALI_UINT8);
+  pipe_ptr->SetExternalInput(input_name, input_wrapper);
   daliSetExternalInput(&handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                       input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
-                       input_shape.sample_dim(), "HWC", DALI_ext_default);
+                       input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                       input_shape.sample_dim(), "HWC", DALI_ext_default);
   daliRun(&handle);
   pipe_ptr->RunCPU();
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
@@ -565,6 +597,7 @@ TYPED_TEST(CApiTest, ExternalSourceMultipleAllocDifferentBackendsTest) {
   pipe_ptr->RunGPU();
 
   ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr);
+  daliDeletePipeline(&handle);
 }
 
 TYPED_TEST(CApiTest, TestExecutorMeta) {
@@ -592,22 +625,24 @@ TYPED_TEST(CApiTest, TestExecutorMeta) {
     }
   }
   daliFreeExecutorMetadata(meta, N);
+  daliDeletePipeline(&handle);
 }
 
 TYPED_TEST(CApiTest, UseCopyKernel) {
   TensorListShape<> input_shape = {{37, 23, 3}, {12, 22, 3}, {42, 42, 3}, {8, 8, 3},
                                    {64, 32, 3}, {32, 64, 3}, {20, 20, 3}, {64, 64, 3},
                                    {10, 10, 3}, {60, 50, 3}, {10, 15, 3}, {48, 48, 3}};
-  TensorList<CPUBackend> input_cpu;
-  input_cpu.Resize(input_shape, DALI_UINT8);
-
-  TensorList<TypeParam> input;
-  if (std::is_same<TypeParam, CPUBackend>::value)
-    input.set_pinned(true);
+  auto num_elems = input_shape.num_elements();
 
+  auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+  auto input = AllocBuffer<TypeParam>(num_elems * sizeof(uint8_t),
+                                      std::is_same<TypeParam, CPUBackend>::value);
+  TensorList<TypeParam> input_wrapper;
+  if (std::is_same<TypeParam, CPUBackend>::value) {
+    input_wrapper.set_pinned(true);
+  }
   auto pipe_ptr = GetTestPipeline(false, this->output_device_);
   auto serialized = pipe_ptr->SerializeToProtobuf();
-
   pipe_ptr->Build();
 
   daliPipelineHandle handle;
@@ -619,12 +654,14 @@ TYPED_TEST(CApiTest, UseCopyKernel) {
   if (std::is_same<TypeParam, CPUBackend>::value)
     flags |= DALI_ext_pinned;
 
   for (int i = 0; i < prefetch_queue_depth; i++) {
-    SequentialFill(view<uint8_t>(input_cpu), 42 * i);
+    SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42 * i);
     // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-    input.Copy(input_cpu, cuda_stream);
-    pipe_ptr->SetExternalInput(input_name, input);
+    MemCopy(input.get(), input_cpu.get(), num_elems, cuda_stream);
+    input_wrapper.ShareData(std::static_pointer_cast<void>(input), num_elems * sizeof(uint8_t),
+                            input_shape, DALI_UINT8);
+    pipe_ptr->SetExternalInput(input_name, input_wrapper);
     daliSetExternalInputAsync(&handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                              input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                              input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
                               input_shape.sample_dim(), nullptr, cuda_stream, flags);
   }
@@ -638,6 +675,7 @@ TYPED_TEST(CApiTest, UseCopyKernel) {
   for (int i = 0; i < prefetch_queue_depth; i++) {
     ComparePipelinesOutputs<TypeParam>(handle, *pipe_ptr, flags);
   }
+  daliDeletePipeline(&handle);
 }
@@ -645,10 +683,10 @@ TYPED_TEST(CApiTest, ForceNoCopyFail) {
   TensorListShape<> input_shape = {{37, 23, 3}, {12, 22, 3}, {42, 42, 3}, {8, 8, 3},
                                    {64, 32, 3}, {32, 64, 3}, {20, 20, 3}, {64, 64, 3},
                                    {10, 10, 3}, {60, 50, 3}, {10, 15, 3}, {48, 48, 3}};
-  TensorList<CPUBackend> input_cpu;
-  input_cpu.Resize(input_shape, DALI_UINT8);
-
-  TensorList<TypeParam> input;
+  auto num_elems = input_shape.num_elements();
+
+  auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
+  auto input = AllocBuffer<TypeParam>(num_elems * sizeof(uint8_t), false);
 
   auto device = backend_to_device_type<TypeParam>::value;
   std::string device_str = GetDeviceStr(device);
@@ -659,25 +697,24 @@ TYPED_TEST(CApiTest, ForceNoCopyFail) {
   auto pipe_ptr = GetExternalSourcePipeline(false, other_device_str);
   auto serialized = pipe_ptr->SerializeToProtobuf();
-
   pipe_ptr->Build();
-
   daliPipelineHandle handle;
   daliCreatePipeline(&handle, serialized.c_str(), serialized.size(), batch_size, num_thread,
                      device_id, false, prefetch_queue_depth, prefetch_queue_depth,
                      prefetch_queue_depth, false);
 
-  SequentialFill(view<uint8_t>(input_cpu), 42);
-  // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-  input.Copy(input_cpu, cuda_stream);
+  SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42);
+  // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
+  MemCopy(input.get(), input_cpu.get(), num_elems, cuda_stream);
 
   // Try to fill the pipeline placed on "other_device" with data placed on the current "device"
   // while forcing NO COPY. It's not allowed to do a no copy across backends and it should error
   // out.
   ASSERT_THROW(daliSetExternalInputAsync(
                    &handle, input_name.c_str(), backend_to_device_type<TypeParam>::value,
-                   input.raw_data(), dali_data_type_t::DALI_UINT8, input_shape.data(),
+                   input.get(), dali_data_type_t::DALI_UINT8, input_shape.data(),
                    input_shape.sample_dim(), nullptr, cuda_stream, DALI_ext_force_no_copy),
                std::runtime_error);
+  daliDeletePipeline(&handle);
 }
@@ -686,10 +723,9 @@ void TestForceFlagRun(bool ext_src_no_copy, unsigned int flag_to_test) {
   TensorListShape<> input_shape = {{37, 23, 3}, {12, 22, 3}, {42, 42, 3}, {8, 8, 3},
                                    {64, 32, 3}, {32, 64, 3}, {20, 20, 3}, {64, 64, 3},
                                    {10, 10, 3}, {60, 50, 3}, {10, 15, 3}, {48, 48, 3}};
-  TensorList<CPUBackend> input_cpu;
-  input_cpu.Resize(input_shape, DALI_UINT8);
-
-  TensorList<Backend> input;
+  auto num_elems = input_shape.num_elements();
+
+  auto input_cpu = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);
 
   auto device = backend_to_device_type<Backend>::value;
   std::string device_str = GetDeviceStr(device);
@@ -704,26 +740,32 @@ void TestForceFlagRun(bool ext_src_no_copy, unsigned int flag_to_test) {
                      device_id, false, prefetch_queue_depth, prefetch_queue_depth,
                      prefetch_queue_depth, false);
 
-  std::vector<TensorList<Backend>> data;
-  data.resize(prefetch_queue_depth);
+  std::vector<std::shared_ptr<uint8_t>> data;
+  data.reserve(prefetch_queue_depth);
+  for (int i = 0; i < prefetch_queue_depth; i++) {
+    data.push_back(AllocBuffer<Backend>(num_elems * sizeof(uint8_t), false));
+  }
+  std::vector<TensorList<Backend>> input_wrapper(prefetch_queue_depth);
 
   for (int i = 0; i < prefetch_queue_depth; i++) {
-    SequentialFill(view<uint8_t>(input_cpu), 42 * i);
+    SequentialFill(TensorListView<StorageCPU, uint8_t>(input_cpu.get(), input_shape), 42 * i);
     // Unnecessary copy in case of CPUBackend, makes the code generic across Backends
-    data[i].Copy(input_cpu, cuda_stream);
-    pipe_ptr->SetExternalInput(input_name, data[i]);
+    MemCopy(data[i].get(), input_cpu.get(), num_elems, cuda_stream);
+    input_wrapper[i].ShareData(std::static_pointer_cast<void>(data[i]),
+                               num_elems * sizeof(uint8_t), input_shape, DALI_UINT8);
+    pipe_ptr->SetExternalInput(input_name, input_wrapper[i]);
     if (flag_to_test == DALI_ext_force_no_copy) {
       // for no copy, we just pass the view to data
      daliSetExternalInputAsync(&handle, input_name.c_str(),
-                                backend_to_device_type<Backend>::value, data[i].raw_data(),
+                                backend_to_device_type<Backend>::value, data[i].get(),
                                 dali_data_type_t::DALI_UINT8, input_shape.data(),
                                input_shape.sample_dim(), nullptr, cuda_stream, flag_to_test);
     } else {
-      TensorList<Backend> tmp_data;
-      tmp_data.Copy(data[i], cuda_stream);
+      auto tmp_data = AllocBuffer<Backend>(num_elems * sizeof(uint8_t), false);
+      MemCopy(tmp_data.get(), data[i].get(), num_elems, cuda_stream);
       // We pass a temporary TensorList as input and force the copy
       daliSetExternalInputAsync(&handle, input_name.c_str(),
-                                backend_to_device_type<Backend>::value, tmp_data.raw_data(),
+                                backend_to_device_type<Backend>::value, tmp_data.get(),
                                 dali_data_type_t::DALI_UINT8, input_shape.data(),
                                 input_shape.sample_dim(), nullptr, cuda_stream, flag_to_test);
     }
@@ -739,6 +781,7 @@ void TestForceFlagRun(bool ext_src_no_copy, unsigned int flag_to_test) {
   for (int i = 0; i < prefetch_queue_depth; i++) {
     ComparePipelinesOutputs<Backend>(handle, *pipe_ptr);
   }
+  daliDeletePipeline(&handle);
 }
@@ -772,7 +815,6 @@ TYPED_TEST(CApiTest, daliOutputCopySamples) {
   daliPipelineHandle handle;
   daliDeserializeDefault(&handle, serialized.c_str(), serialized.size());
 
-  daliPrefetchUniform(&handle, prefetch_queue_depth);
   daliRun(&handle);
   daliOutput(&handle);
@@ -875,6 +917,7 @@ TYPED_TEST(CApiTest, daliOutputCopySamples) {
       Check(view<uint8_t>(output1_cpu), view<uint8_t>(output2_cpu));
     }
   }
+  daliDeletePipeline(&handle);
 }
 
 TYPED_TEST(CApiTest, CpuOnlyTest) {
@@ -885,6 +928,7 @@ TYPED_TEST(CApiTest, CpuOnlyTest) {
   std::string ser = pipe.SerializeToProtobuf();
   daliPipelineHandle handle;
   daliDeserializeDefault(&handle, ser.c_str(), ser.size());
+  daliDeletePipeline(&handle);
 }
 
 TEST(CApiTest, GetBackendTest) {
@@ -908,6 +952,7 @@ TEST(CApiTest, GetBackendTest) {
   EXPECT_EQ(daliGetOperatorBackend(&handle, es_cpu_name.c_str()), DALI_BACKEND_CPU);
   EXPECT_EQ(daliGetOperatorBackend(&handle, es_gpu_name.c_str()), DALI_BACKEND_GPU);
   EXPECT_EQ(daliGetOperatorBackend(&handle, cont_name.c_str()), DALI_BACKEND_MIXED);
+  daliDeletePipeline(&handle);
 }
 
 }  // namespace dali
diff --git a/include/dali/c_api.h b/include/dali/c_api.h
index 870a21c1a25..b15aba7ac33 100644
--- a/include/dali/c_api.h
+++ b/include/dali/c_api.h
@@ -33,7 +33,7 @@ extern "C" {
 typedef struct {
   void *pipe;
   void *ws;
-  void *batch_sizes_map;     /// @see batch_size_map_t
+  void *batch_size_map;      /// @see batch_size_map_t
  cudaStream_t copy_stream;  /// Stream to perform copy operations on
 } daliPipelineHandle;
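A note on the test rework: the tests now allocate a raw buffer and wrap it in
a `TensorList` via `ShareData` instead of reaching into the TensorList's
contiguous buffer through `raw_data()`/`raw_mutable_data()`. A minimal sketch
of that pattern, not part of the patch, assuming the CPU backend and a uint8
`input_shape` like the ones declared in the tests:

```cpp
#include <cstdint>
#include <memory>

#include "dali/pipeline/data/buffer.h"
#include "dali/pipeline/data/tensor_list.h"

namespace dali {

// Sketch of the wrapping pattern used by the reworked tests; `input_shape`
// mirrors the uint8 HWC shapes declared in the tests.
void WrapRawBuffer(const TensorListShape<> &input_shape) {
  auto num_elems = input_shape.num_elements();
  // Raw allocation owned by a shared_ptr, as returned by AllocBuffer.
  auto buf = AllocBuffer<CPUBackend>(num_elems * sizeof(uint8_t), false);

  TensorList<CPUBackend> wrapper;
  // Share the allocation with the TensorList instead of accessing the
  // TensorList's underlying contiguous buffer directly.
  wrapper.ShareData(std::static_pointer_cast<void>(buf),
                    num_elems * sizeof(uint8_t), input_shape, DALI_UINT8);
}

}  // namespace dali
```

Because the wrapper only shares the allocation, the buffer can also be handed
to the C API (for example via `daliSetExternalInputAsync`) without copying.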