Skip to content

Commit

Permalink
Synchronize CUDA stream once in operator benchmark (NVIDIA#3525)
Browse files Browse the repository at this point in the history
* Synchronize CUDA stream once in operator benchmark

The CUDA stream was synchronized after every iteration of the operator
benchmark, which skewed the measurements, especially for small data and
small batch sizes. In a real pipeline, no synchronization would happen
after each individual operation.

This commit moves the synchronization out of the loop, synchronizing the
stream only once in a benchmark.

Added a `sync_each_n` parameter that optionally synchronizes the stream after every N batches (disabled by default).

Signed-off-by: Szymon Karpiński <hugo@staszic.waw.pl>
  • Loading branch information
szkarpinski authored and cyyever committed Jan 23, 2022
1 parent efd37b5 commit a90e546
Showing 1 changed file with 18 additions and 10 deletions.
28 changes: 18 additions & 10 deletions dali/benchmark/operator_bench.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class OperatorBench : public DALIBenchmark {
// NOTE(review): this span is a unified-diff rendering — removed (pre-commit)
// and added (post-commit) lines are interleaved and part of the body is elided
// by the "Expand ..." marker below; it is not compilable as-is.
// Benchmarks a single GPU operator: runs `op_spec` repeatedly on a batch of
// `batch_size` tensors with the given `shape`/`layout`, reporting throughput
// via the "FPS" counter.
void RunGPU(benchmark::State &st, const OpSpec &op_spec, int batch_size = 128,
TensorListShape<> shape = uniform_list_shape(128, {1080, 1920, 3}),
TensorLayout layout = "HWC",
bool fill_in_data = false) {
// New optional parameter: if sync_each_n > 0, the CUDA stream is synchronized
// after every sync_each_n batches; the default (-1) syncs only once, after
// the timing loop.
bool fill_in_data = false,
int64_t sync_each_n = -1) {
assert(layout.size() == shape.size());

auto op_ptr = InstantiateOperator(op_spec);
Expand Down Expand Up @@ -117,29 +118,36 @@ class OperatorBench : public DALIBenchmark {
// Warm-up run outside the timed loop; the sync keeps setup cost out of the
// measurement.
Setup<TensorList<GPUBackend>>(op_ptr, op_spec, ws, batch_size);
op_ptr->Run(ws);
CUDA_CALL(cudaStreamSynchronize(0));

int64_t batches = 0;
for (auto _ : st) {
op_ptr->Run(ws);
// (removed) pre-commit code synchronized after every iteration, which
// inflated per-batch time for small workloads:
CUDA_CALL(cudaStreamSynchronize(0));

// (removed) pre-commit code also recomputed the FPS counter every
// iteration:
int num_batches = st.iterations() + 1;
st.counters["FPS"] = benchmark::Counter(batch_size * num_batches,
benchmark::Counter::kIsRate);
// (added) optional periodic synchronization every sync_each_n batches.
batches++;
if (sync_each_n > 0 && batches % sync_each_n == 0) {
CUDA_CALL(cudaStreamSynchronize(0));
}
}

// (added) final sync is wrapped in Resume/PauseTiming so the work still
// queued on the stream is charged to the benchmark; FPS is then computed
// once from the total iteration count.
st.ResumeTiming();
CUDA_CALL(cudaStreamSynchronize(0));
st.PauseTiming();
st.counters["FPS"] = benchmark::Counter(batch_size * st.iterations(),
benchmark::Counter::kIsRate);
}

template <typename T>
void RunGPU(benchmark::State &st, const OpSpec &op_spec, int batch_size = 128,
TensorShape<> shape = {1080, 1920, 3}, TensorLayout layout = "HWC",
bool fill_in_data = false) {
bool fill_in_data = false, int64_t sync_each_n = -1) {
RunGPU<T>(st, op_spec, batch_size,
uniform_list_shape(batch_size, shape), layout, fill_in_data);
uniform_list_shape(batch_size, shape), layout, fill_in_data, sync_each_n);
}

template <typename T>
void RunGPU(benchmark::State& st, const OpSpec &op_spec,
int batch_size = 128, int H = 1080, int W = 1920, int C = 3,
bool fill_in_data = false) {
RunGPU<T>(st, op_spec, batch_size, {H, W, C}, "HWC", fill_in_data);
bool fill_in_data = false, int64_t sync_each_n = -1) {
RunGPU<T>(st, op_spec, batch_size, {H, W, C}, "HWC", fill_in_data, sync_each_n);
}
};

Expand Down

0 comments on commit a90e546

Please sign in to comment.