taichi-dev · taichi-gardener · Jul 11, 2023 · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023
diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp
@@ -55,8 +55,8 @@ Ndarray::Ndarray(Program *prog,
         "Ndarray index might be out of int32 boundary but int64 indexing is "
         "not supported yet.");
   }
-  ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_,
-                                                 prog->result_buffer);
+  ndarray_alloc_ = prog->allocate_memory_on_device(nelement_ * element_size_,
+                                                   prog->result_buffer);
 }
 
 Ndarray::Ndarray(DeviceAllocation &devalloc,

diff --git a/taichi/program/program.h b/taichi/program/program.h
@@ -241,9 +241,9 @@ class TI_DLL_EXPORT Program {
   }
 
   // TODO: do we still need result_buffer?
-  DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
-                                           uint64 *result_buffer) {
-    return program_impl_->allocate_memory_ndarray(alloc_size, result_buffer);
+  DeviceAllocation allocate_memory_on_device(std::size_t alloc_size,
+                                             uint64 *result_buffer) {
+    return program_impl_->allocate_memory_on_device(alloc_size, result_buffer);
   }
   DeviceAllocation allocate_texture(const ImageParams &params) {
     return program_impl_->allocate_texture(params);

diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h
@@ -95,8 +95,8 @@ class ProgramImpl {
     return kDeviceNullPtr;
   }
 
-  virtual DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
-                                                   uint64 *result_buffer) {
+  virtual DeviceAllocation allocate_memory_on_device(std::size_t alloc_size,
+                                                     uint64 *result_buffer) {
     return kDeviceNullAllocation;
   }
 

diff --git a/taichi/runtime/amdgpu/kernel_launcher.cpp b/taichi/runtime/amdgpu/kernel_launcher.cpp
@@ -56,7 +56,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
         if (on_amdgpu_device(data_ptr)) {
           device_ptrs[data_ptr_idx] = data_ptr;
         } else {
-          DeviceAllocation devalloc = executor->allocate_memory_ndarray(
+          DeviceAllocation devalloc = executor->allocate_memory_on_device(
               arr_sz, (uint64 *)device_result_buffer);
           device_ptrs[data_ptr_idx] =
               executor->get_ndarray_alloc_info_ptr(devalloc);
@@ -128,7 +128,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
       AMDGPUDriver::get_instance().memcpy_device_to_host(
           itr->second.first, (void *)device_ptrs[idx],
           ctx.array_runtime_sizes[arg_id]);
-      executor->deallocate_memory_ndarray(itr->second.second);
+      executor->deallocate_memory_on_device(itr->second.second);
     }
   }
 }

diff --git a/taichi/runtime/cuda/kernel_launcher.cpp b/taichi/runtime/cuda/kernel_launcher.cpp
@@ -80,7 +80,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
           device_ptrs[data_ptr_idx] = data_ptr;
           device_ptrs[grad_ptr_idx] = grad_ptr;
         } else {
-          DeviceAllocation devalloc = executor->allocate_memory_ndarray(
+          DeviceAllocation devalloc = executor->allocate_memory_on_device(
               arr_sz, (uint64 *)device_result_buffer);
           device_ptrs[data_ptr_idx] =
               executor->get_ndarray_alloc_info_ptr(devalloc);
@@ -89,8 +89,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
           CUDADriver::get_instance().memcpy_host_to_device(
               (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz);
           if (grad_ptr != nullptr) {
-            DeviceAllocation grad_devalloc = executor->allocate_memory_ndarray(
-                arr_sz, (uint64 *)device_result_buffer);
+            DeviceAllocation grad_devalloc =
+                executor->allocate_memory_on_device(
+                    arr_sz, (uint64 *)device_result_buffer);
             device_ptrs[grad_ptr_idx] =
                 executor->get_ndarray_alloc_info_ptr(grad_devalloc);
             transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc};
@@ -165,7 +166,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
       CUDADriver::get_instance().memcpy_device_to_host(
           itr->second.first, (void *)device_ptrs[idx],
           ctx.array_runtime_sizes[arg_id]);
-      executor->deallocate_memory_ndarray(itr->second.second);
+      executor->deallocate_memory_on_device(itr->second.second);
     }
   }
 }

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -473,7 +473,7 @@ LlvmDevice *LlvmRuntimeExecutor::llvm_device() {
   return static_cast<LlvmDevice *>(device_.get());
 }
 
-DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
+DeviceAllocation LlvmRuntimeExecutor::allocate_memory_on_device(
     std::size_t alloc_size,
     uint64 *result_buffer) {
   auto devalloc = llvm_device()->allocate_memory_runtime(
@@ -490,7 +490,7 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
   return devalloc;
 }
 
-void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
+void LlvmRuntimeExecutor::deallocate_memory_on_device(DeviceAllocation handle) {
   TI_ASSERT(allocated_runtime_memory_allocs_.find(handle.alloc_id) !=
             allocated_runtime_memory_allocs_.end());
   llvm_device()->dealloc_memory(handle);
@@ -562,7 +562,7 @@ void LlvmRuntimeExecutor::finalize() {
       if (ptr == nullptr)
         continue;
 
-      deallocate_memory_ndarray(iter.second);
+      deallocate_memory_on_device(iter.second);
     }
     allocated_runtime_memory_allocs_.clear();
 

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -47,11 +47,11 @@ class LlvmRuntimeExecutor {
       const LlvmOfflineCache::FieldCacheData &field_cache_data,
       uint64 *result_buffer);
 
-  // Ndarray Allocation
-  DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
-                                           uint64 *result_buffer);
+  // Device memory allocation for Ndarray, ArgPack, and SNode tree buffers
+  DeviceAllocation allocate_memory_on_device(std::size_t alloc_size,
+                                             uint64 *result_buffer);
 
-  void deallocate_memory_ndarray(DeviceAllocation handle);
+  void deallocate_memory_on_device(DeviceAllocation handle);
 
   void check_runtime_error(uint64 *result_buffer);
 

diff --git a/taichi/runtime/llvm/snode_tree_buffer_manager.cpp b/taichi/runtime/llvm/snode_tree_buffer_manager.cpp
@@ -12,14 +12,14 @@ SNodeTreeBufferManager::SNodeTreeBufferManager(
 Ptr SNodeTreeBufferManager::allocate(std::size_t size,
                                      const int snode_tree_id,
                                      uint64 *result_buffer) {
-  auto devalloc = runtime_exec_->allocate_memory_ndarray(size, result_buffer);
+  auto devalloc = runtime_exec_->allocate_memory_on_device(size, result_buffer);
   snode_tree_id_to_device_alloc_[snode_tree_id] = devalloc;
   return (Ptr)runtime_exec_->get_ndarray_alloc_info_ptr(devalloc);
 }
 
 void SNodeTreeBufferManager::destroy(SNodeTree *snode_tree) {
   auto devalloc = snode_tree_id_to_device_alloc_[snode_tree->id()];
-  runtime_exec_->deallocate_memory_ndarray(devalloc);
+  runtime_exec_->deallocate_memory_on_device(devalloc);
   snode_tree_id_to_device_alloc_.erase(snode_tree->id());
 }
 

diff --git a/taichi/runtime/program_impls/gfx/gfx_program.cpp b/taichi/runtime/program_impls/gfx/gfx_program.cpp
@@ -40,7 +40,7 @@ std::unique_ptr<AotModuleBuilder> GfxProgramImpl::make_aot_module_builder(
   }
 }
 
-DeviceAllocation GfxProgramImpl::allocate_memory_ndarray(
+DeviceAllocation GfxProgramImpl::allocate_memory_on_device(
     std::size_t alloc_size,
     uint64 *result_buffer) {
   DeviceAllocation alloc;
@@ -51,6 +51,7 @@ DeviceAllocation GfxProgramImpl::allocate_memory_ndarray(
   TI_ASSERT(res == RhiResult::success);
   return alloc;
 }
+
 DeviceAllocation GfxProgramImpl::allocate_texture(const ImageParams &params) {
   return runtime_->create_image(params);
 }

diff --git a/taichi/runtime/program_impls/gfx/gfx_program.h b/taichi/runtime/program_impls/gfx/gfx_program.h
@@ -41,8 +41,8 @@ class GfxProgramImpl : public ProgramImpl {
     snode_tree_mgr_->destroy_snode_tree(snode_tree);
   }
 
-  DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
-                                           uint64 *result_buffer) override;
+  DeviceAllocation allocate_memory_on_device(std::size_t alloc_size,
+                                             uint64 *result_buffer) override;
 
   bool used_in_kernel(DeviceAllocationId id) override {
     return runtime_->used_in_kernel(id);

diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -121,9 +121,9 @@ class LlvmProgramImpl : public ProgramImpl {
     return runtime_exec_->fill_ndarray(alloc, size, data);
   }
 
-  DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
-                                           uint64 *result_buffer) override {
-    return runtime_exec_->allocate_memory_ndarray(alloc_size, result_buffer);
+  DeviceAllocation allocate_memory_on_device(std::size_t alloc_size,
+                                             uint64 *result_buffer) override {
+    return runtime_exec_->allocate_memory_on_device(alloc_size, result_buffer);
   }
 
   Device *get_compute_device() override {

diff --git a/tests/cpp/aot/llvm/graph_aot_test.cpp b/tests/cpp/aot/llvm/graph_aot_test.cpp
@@ -46,9 +46,9 @@ TEST(LlvmCGraph, RunGraphCpu) {
   constexpr int ArrLength = 100;
   constexpr int kArrBytes_arr = ArrLength * 1 * sizeof(int32_t);
   auto devalloc_arr_0 =
-      exec.allocate_memory_ndarray(kArrBytes_arr, result_buffer);
+      exec.allocate_memory_on_device(kArrBytes_arr, result_buffer);
   auto devalloc_arr_1 =
-      exec.allocate_memory_ndarray(kArrBytes_arr, result_buffer);
+      exec.allocate_memory_on_device(kArrBytes_arr, result_buffer);
 
   /* Test with Graph */
   // Prepare & Run "init" Graph
@@ -115,10 +115,10 @@ TEST(LlvmCGraph, RunGraphCuda) {
     constexpr int ArrLength = 100;
     constexpr int kArrBytes_arr = ArrLength * 1 * sizeof(int32_t);
     auto devalloc_arr_0 =
-        exec.allocate_memory_ndarray(kArrBytes_arr, result_buffer);
+        exec.allocate_memory_on_device(kArrBytes_arr, result_buffer);
 
     auto devalloc_arr_1 =
-        exec.allocate_memory_ndarray(kArrBytes_arr, result_buffer);
+        exec.allocate_memory_on_device(kArrBytes_arr, result_buffer);
 
     /* Test with Graph */
     // Prepare & Run "init" Graph

diff --git a/tests/cpp/aot/llvm/kernel_aot_test.cpp b/tests/cpp/aot/llvm/kernel_aot_test.cpp
@@ -32,7 +32,7 @@ TEST(LlvmAotTest, CpuKernel) {
 
   constexpr int kArrLen = 32;
   constexpr int kArrBytes = kArrLen * sizeof(int32_t);
-  auto arr_devalloc = exec.allocate_memory_ndarray(kArrBytes, result_buffer);
+  auto arr_devalloc = exec.allocate_memory_on_device(kArrBytes, result_buffer);
   Ndarray arr = Ndarray(arr_devalloc, PrimitiveType::i32, {kArrLen});
 
   LLVM::AotModuleParams aot_params;
@@ -80,7 +80,8 @@ TEST(LlvmAotTest, CudaKernel) {
 
     constexpr int kArrLen = 32;
     constexpr int kArrBytes = kArrLen * sizeof(int32_t);
-    auto arr_devalloc = exec.allocate_memory_ndarray(kArrBytes, result_buffer);
+    auto arr_devalloc =
+        exec.allocate_memory_on_device(kArrBytes, result_buffer);
     Ndarray arr = Ndarray(arr_devalloc, PrimitiveType::i32, {kArrLen});
 
     LLVM::AotModuleParams aot_params;

diff --git a/tests/cpp/aot/llvm/mpm88_graph_aot_test.cpp b/tests/cpp/aot/llvm/mpm88_graph_aot_test.cpp
@@ -51,17 +51,17 @@ TEST(LlvmCGraph, Mpm88Cpu) {
 
   /* Prepare arguments */
   constexpr int kArrBytes_x = NR_PARTICLES * 2 * sizeof(float);
-  auto devalloc_x = exec.allocate_memory_ndarray(kArrBytes_x, result_buffer);
+  auto devalloc_x = exec.allocate_memory_on_device(kArrBytes_x, result_buffer);
   auto x = taichi::lang::Ndarray(devalloc_x, taichi::lang::PrimitiveType::f32,
                                  {NR_PARTICLES}, {2});
 
   constexpr int kArrBytes_v = NR_PARTICLES * 2 * sizeof(float);
-  auto devalloc_v = exec.allocate_memory_ndarray(kArrBytes_v, result_buffer);
+  auto devalloc_v = exec.allocate_memory_on_device(kArrBytes_v, result_buffer);
   auto v = taichi::lang::Ndarray(devalloc_v, taichi::lang::PrimitiveType::f32,
                                  {NR_PARTICLES}, {2});
 
   constexpr int kArrBytes_J = NR_PARTICLES * sizeof(float);
-  auto devalloc_J = exec.allocate_memory_ndarray(kArrBytes_J, result_buffer);
+  auto devalloc_J = exec.allocate_memory_on_device(kArrBytes_J, result_buffer);
   auto J = taichi::lang::Ndarray(devalloc_J, taichi::lang::PrimitiveType::f32,
                                  {NR_PARTICLES});
 
@@ -78,24 +78,24 @@ TEST(LlvmCGraph, Mpm88Cpu) {
 
   constexpr int kArrBytes_grid_v = N_GRID * N_GRID * 2 * sizeof(float);
   auto devalloc_grid_v =
-      exec.allocate_memory_ndarray(kArrBytes_grid_v, result_buffer);
+      exec.allocate_memory_on_device(kArrBytes_grid_v, result_buffer);
   auto grid_v = taichi::lang::Ndarray(
       devalloc_grid_v, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID}, {2});
 
   constexpr int kArrBytes_grid_m = N_GRID * N_GRID * sizeof(float);
   auto devalloc_grid_m =
-      exec.allocate_memory_ndarray(kArrBytes_grid_m, result_buffer);
+      exec.allocate_memory_on_device(kArrBytes_grid_m, result_buffer);
   auto grid_m = taichi::lang::Ndarray(
       devalloc_grid_m, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID});
 
   constexpr int kArrBytes_pos = NR_PARTICLES * 3 * sizeof(float);
   auto devalloc_pos =
-      exec.allocate_memory_ndarray(kArrBytes_pos, result_buffer);
+      exec.allocate_memory_on_device(kArrBytes_pos, result_buffer);
   auto pos = taichi::lang::Ndarray(
       devalloc_pos, taichi::lang::PrimitiveType::f32, {NR_PARTICLES}, {3});
 
   constexpr int kArrBytes_C = NR_PARTICLES * sizeof(float) * 2 * 2;
-  auto devalloc_C = exec.allocate_memory_ndarray(kArrBytes_C, result_buffer);
+  auto devalloc_C = exec.allocate_memory_on_device(kArrBytes_C, result_buffer);
   auto C = taichi::lang::Ndarray(devalloc_C, taichi::lang::PrimitiveType::f32,
                                  {NR_PARTICLES}, {2, 2});
 
@@ -136,17 +136,20 @@ TEST(LlvmCGraph, Mpm88Cuda) {
 
     /* Prepare arguments */
     constexpr int kArrBytes_x = NR_PARTICLES * 2 * sizeof(float);
-    auto devalloc_x = exec.allocate_memory_ndarray(kArrBytes_x, result_buffer);
+    auto devalloc_x =
+        exec.allocate_memory_on_device(kArrBytes_x, result_buffer);
     auto x = taichi::lang::Ndarray(devalloc_x, taichi::lang::PrimitiveType::f32,
                                    {NR_PARTICLES}, {2});
 
     constexpr int kArrBytes_v = NR_PARTICLES * 2 * sizeof(float);
-    auto devalloc_v = exec.allocate_memory_ndarray(kArrBytes_v, result_buffer);
+    auto devalloc_v =
+        exec.allocate_memory_on_device(kArrBytes_v, result_buffer);
     auto v = taichi::lang::Ndarray(devalloc_v, taichi::lang::PrimitiveType::f32,
                                    {NR_PARTICLES}, {2});
 
     constexpr int kArrBytes_J = NR_PARTICLES * sizeof(float);
-    auto devalloc_J = exec.allocate_memory_ndarray(kArrBytes_J, result_buffer);
+    auto devalloc_J =
+        exec.allocate_memory_on_device(kArrBytes_J, result_buffer);
     auto J = taichi::lang::Ndarray(devalloc_J, taichi::lang::PrimitiveType::f32,
                                    {NR_PARTICLES});
 
@@ -163,25 +166,26 @@ TEST(LlvmCGraph, Mpm88Cuda) {
 
     constexpr int kArrBytes_grid_v = N_GRID * N_GRID * 2 * sizeof(float);
     auto devalloc_grid_v =
-        exec.allocate_memory_ndarray(kArrBytes_grid_v, result_buffer);
+        exec.allocate_memory_on_device(kArrBytes_grid_v, result_buffer);
     auto grid_v =
         taichi::lang::Ndarray(devalloc_grid_v, taichi::lang::PrimitiveType::f32,
                               {N_GRID, N_GRID}, {2});
 
     constexpr int kArrBytes_grid_m = N_GRID * N_GRID * sizeof(float);
     auto devalloc_grid_m =
-        exec.allocate_memory_ndarray(kArrBytes_grid_m, result_buffer);
+        exec.allocate_memory_on_device(kArrBytes_grid_m, result_buffer);
     auto grid_m = taichi::lang::Ndarray(
         devalloc_grid_m, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID});
 
     constexpr int kArrBytes_pos = NR_PARTICLES * 3 * sizeof(float);
     auto devalloc_pos =
-        exec.allocate_memory_ndarray(kArrBytes_pos, result_buffer);
+        exec.allocate_memory_on_device(kArrBytes_pos, result_buffer);
     auto pos = taichi::lang::Ndarray(
         devalloc_pos, taichi::lang::PrimitiveType::f32, {NR_PARTICLES}, {3});
 
     constexpr int kArrBytes_C = NR_PARTICLES * sizeof(float) * 2 * 2;
-    auto devalloc_C = exec.allocate_memory_ndarray(kArrBytes_C, result_buffer);
+    auto devalloc_C =
+        exec.allocate_memory_on_device(kArrBytes_C, result_buffer);
     auto C = taichi::lang::Ndarray(devalloc_C, taichi::lang::PrimitiveType::f32,
                                    {NR_PARTICLES}, {2, 2});