dmlc · nv-dlasalle · May 17, 2023 · May 12, 2023 · May 13, 2023 · May 16, 2023
diff --git a/src/array/cuda/rowwise_sampling.cu b/src/array/cuda/rowwise_sampling.cu
@@ -287,13 +287,13 @@ COOMatrix _CSRRowWiseSamplingUniform(
   cudaEvent_t copyEvent;
   CUDA_CALL(cudaEventCreate(&copyEvent));
 
-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
-  // wait on a cudaevent
-  IdType new_len;
+  auto new_len_tensor = NDArray::PinnedEmpty(
+      {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+
   // copy using the internal current stream
-  device->CopyDataFromTo(
-      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
-      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
+  CUDA_CALL(cudaMemcpyAsync(
+      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
+      cudaMemcpyDeviceToHost, stream));
   CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -322,6 +322,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
   CUDA_CALL(cudaEventSynchronize(copyEvent));
   CUDA_CALL(cudaEventDestroy(copyEvent));
 
+  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
   picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);

diff --git a/src/graph/transform/cuda/cuda_to_block.cu b/src/graph/transform/cuda/cuda_to_block.cu
@@ -165,6 +165,9 @@ struct CUDAIdsMapper {
             NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
       }
     }
+
+    cudaEvent_t copyEvent;
+    NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
       int64_t* count_lhs_device = static_cast<int64_t*>(
@@ -174,13 +177,16 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);
 
-      device->CopyDataFromTo(
-          count_lhs_device, 0, num_nodes_per_type.data(), 0,
-          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
-          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
-      device->StreamSync(ctx, stream);
+      CUDA_CALL(cudaEventCreate(&copyEvent));
+      new_len_tensor = NDArray::PinnedEmpty(
+          {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+          DGLContext{kDGLCPU, 0});
+      CUDA_CALL(cudaMemcpyAsync(
+          new_len_tensor->data, count_lhs_device,
+          sizeof(*num_nodes_per_type.data()) * num_ntypes,
+          cudaMemcpyDeviceToHost, stream));
+      CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
-      // Wait for the node counts to finish transferring.
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
       maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
@@ -189,14 +195,23 @@ struct CUDAIdsMapper {
         num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
       }
     }
-    // Resize lhs nodes.
+    // Map node numberings from global to local, and build pointer for CSR.
+    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
+
     if (generate_lhs_nodes) {
+      // wait for the async device-to-host copy of the node counts to finish
+      CUDA_CALL(cudaEventSynchronize(copyEvent));
+      CUDA_CALL(cudaEventDestroy(copyEvent));
+
+      // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
+        num_nodes_per_type[ntype] =
+            static_cast<int64_t*>(new_len_tensor->data)[ntype];
         lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
       }
     }
-    // Map node numberings from global to local, and build pointer for CSR.
-    return MapEdges(graph, edge_arrays, node_maps, stream);
+
+    return ret;
   }
 };