Merge pull request #2 from fwyzard/implement_cupla
Add rawtodigi implementation using the Cupla backend
makortel committed Apr 12, 2019
2 parents 2d45e1e + ea0d813 commit 703ec17
Showing 9 changed files with 732 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
*~
main-*
debug-*
35 changes: 34 additions & 1 deletion GPUSimpleVector.h
@@ -6,7 +6,11 @@
#include <type_traits>
#include <utility>

#if defined DIGI_CUDA
#include <cuda.h>
#elif defined DIGI_CUPLA
#include <cuda_to_cupla.hpp>
#endif

namespace GPU {
template <class T> struct SimpleVector {
@@ -51,7 +55,35 @@ template <class T> struct SimpleVector {
    return T(); //undefined behaviour
  }

-#ifdef __CUDACC__
+#ifdef DIGI_CUPLA

  template <typename T_Acc>
  ALPAKA_FN_ACC
  int push_back(T_Acc const& acc, const T &element) {
    auto previousSize = atomicAdd(&m_size, 1);
    if (previousSize < m_capacity) {
      m_data[previousSize] = element;
      return previousSize;
    } else {
      atomicSub(&m_size, 1);
      return -1;
    }
  }

  template <typename T_Acc, class... Ts>
  ALPAKA_FN_ACC
  int emplace_back(T_Acc const& acc, Ts &&... args) {
    auto previousSize = atomicAdd(&m_size, 1);
    if (previousSize < m_capacity) {
      (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
      return previousSize;
    } else {
      atomicSub(&m_size, 1);
      return -1;
    }
  }

#elif defined __CUDACC__

// thread-safe version of the vector, when used in a CUDA kernel
__device__
@@ -80,6 +112,7 @@ template <class T> struct SimpleVector {
}

#endif // __CUDACC__

inline constexpr bool empty() const { return m_size==0;}
inline constexpr bool full() const { return m_size==m_capacity;}
inline constexpr T& operator[](int i) { return m_data[i]; }
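For context on how the new overloads are meant to be called: they keep the same contract as the existing CUDA versions, reserving a slot with `atomicAdd`, rolling the size back with `atomicSub` if the capacity is exceeded, and returning either the slot index or -1. A minimal sketch of a device-side caller written as an Alpaka-style functor (the functor name and the `int` payload are purely illustrative, not part of this commit):

```cpp
#include "GPUSimpleVector.h"

struct appendPositive {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(T_Acc const& acc,
                                GPU::SimpleVector<int>* vec, int value) const {
    if (value > 0) {
      // Thread-safe append: returns the index of the stored element,
      // or -1 if the vector is already at capacity.
      int idx = vec->push_back(acc, value);
      if (idx < 0) {
        // capacity exceeded; push_back already rolled the size back
      }
    }
  }
};
```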
65 changes: 53 additions & 12 deletions Makefile
@@ -1,19 +1,60 @@
-CXXFLAGS := -O2 -std=c++17
-CXX := g++
+TARGETS = naive cuda cupla-cuda cupla-serial
 
-CUDAFLAGS := -O2 -std=c++14 --expt-relaxed-constexpr
-NVCC := nvcc
+.PHONY: default all debug clean $(TARGETS)
 
+CXX := g++-7
+CXX_FLAGS := -O2 -std=c++14 -ftemplate-depth-512
+CXX_DEBUG := -g
+
+NVCC := /usr/local/cuda-10.0/bin/nvcc -ccbin $(CXX)
+NVCC_FLAGS := -O2 -std=c++14 --expt-relaxed-constexpr
+NVCC_DEBUG := -g -lineinfo
+
+ALPAKA_BASE := $(HOME)/src/alpaka/alpaka
+CUPLA_BASE := $(HOME)/src/alpaka/cupla
+CUPLA_FLAGS := -DALPAKA_DEBUG=0 -DCUPLA_STREAM_ASYNC_ENABLED=1 -I$(ALPAKA_BASE)/include -I$(CUPLA_BASE)/include -L$(CUPLA_BASE)/lib
+
 default: naive
 
-naive:
-	$(CXX) $(CXXFLAGS) -DDIGI_NAIVE -o main-naive main.cc
+all: $(TARGETS)
+
+debug: $(TARGETS:%=debug-%)
+
+clean:
+	rm -f $(TARGETS:%=main-%) $(TARGETS:%=debug-%)
+
+# Naive CPU implementation
+naive: main-naive
+
+main-naive: main.cc rawtodigi_naive.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_NAIVE -o main-naive main.cc
+
+debug-naive: main.cc rawtodigi_naive.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_NAIVE $(CXX_DEBUG) -o debug-naive main.cc
+
+# CUDA implementation
+cuda: main-cuda
+
+main-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h
+	$(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu
+
+debug-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h
+	$(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA $(NVCC_DEBUG) -o debug-cuda main.cc rawtodigi_cuda.cu
+
+# Alpaka/cupla implementation, with CUDA backend
+cupla-cuda: main-cupla-cuda
+
+main-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda -o main-cupla-cuda main.cc rawtodigi_cupla.cc
+
+debug-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda $(NVCC_DEBUG) -o debug-cupla-cuda main.cc rawtodigi_cupla.cc
 
-debug-naive:
-	$(CXX) $(CXXFLAGS) -DDIGI_NAIVE -g -o main-naive main.cc
+# Alpaka/cupla implementation, with serial cpu backend
+cupla-serial: main-cupla-serial
 
-cuda:
-	$(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu
+main-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread -o main-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial
 
-debug-cuda:
-	$(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -g -G -o main-cuda main.cc rawtodigi_cuda.cu
+debug-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread $(CXX_DEBUG) -o debug-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial
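For reference, with these targets a typical workflow might look like the following; the CUDA, Alpaka and Cupla paths are hard-coded in the variables above and may need adapting to the local installation:

```bash
make            # default target, builds main-naive
make all        # builds main-naive, main-cuda, main-cupla-cuda and main-cupla-serial
make cupla-serial
make debug      # builds the debug-* variant of every target
make clean

./main-naive
./main-cupla-serial
```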
41 changes: 37 additions & 4 deletions README.md
@@ -5,10 +5,12 @@ The purpose of this test program is to experiment with various

## Current implementations

-| Implementation | Execuable | `make` target | `#ifdef` macro |
-|----------------|--------------|---------------|----------------|
-| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` |
-| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` |
+| Implementation | Executable | `make` target | `#ifdef` macro |
+|----------------|----------------------|---------------|----------------|
+| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` |
+| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` |
+| Cupla on CPU | `main-cupla-serial` |`cupla-serial` | `DIGI_CUPLA` |
+| Cupla on GPU | `main-cupla-cuda` |`cupla-cuda` | `DIGI_CUPLA` |

### Naive CPU

@@ -20,6 +22,37 @@ The CUDA test program requires a recent CUDA version (`nvcc`
supporting C++14 and `--expt-relaxed-constexpr`) and a machine with a GPU.

### Cupla

The Cupla test program can be compiled for different backends; so far it has
been tested with the CUDA backend (`-DALPAKA_ACC_GPU_CUDA_ENABLED`) and the
serial CPU backend (`-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED`). The CUDA backend
requires CUDA 9.2 or CUDA 10.0, and has been tested with gcc 8.

Note that the Cupla libraries need to be built before they can be used:
```bash
export ALPAKA_ROOT=$HOME/src/alpaka/alpaka
export CUPLA_ROOT=$HOME/src/alpaka/cupla

git clone git@github.com:ComputationalRadiationPhysics/alpaka.git -b 0.3.5 $ALPAKA_ROOT
git clone git@github.com:ComputationalRadiationPhysics/cupla.git -b 0.1.1 $CUPLA_ROOT

mkdir -p $CUPLA_ROOT/build $CUPLA_ROOT/lib
cd $CUPLA_ROOT/build
FILES="$CUPLA_ROOT/src/*.cpp $CUPLA_ROOT/src/manager/*.cpp"

for FILE in $FILES; do
$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -x cu -c $FILE -o cuda_$(basename $FILE).o
done
$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -shared cuda_*.o -o $CUPLA_ROOT/lib/libcupla-cuda.so

for FILE in $FILES; do
$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -c $FILE -o serial_$(basename $FILE).o
done
$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -shared serial_*.o -o $CUPLA_ROOT/lib/libcupla-serial.so

```
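The snippet above relies on `$CXX`, `$CXX_FLAGS`, `$NVCC` and `$NVCC_FLAGS` being set in the environment beforehand. One possible choice, mirroring the Makefile defaults, is shown below; the exact flags, in particular `-fPIC` for building the shared libraries and the Alpaka/Cupla include paths, are an assumption here rather than part of this commit:

```bash
export CXX="g++-7"
export CXX_FLAGS="-O2 -std=c++14 -fPIC -I$ALPAKA_ROOT/include -I$CUPLA_ROOT/include"
export NVCC="/usr/local/cuda-10.0/bin/nvcc -ccbin g++-7"
export NVCC_FLAGS="-O2 -std=c++14 --expt-relaxed-constexpr -Xcompiler -fPIC -I$ALPAKA_ROOT/include -I$CUPLA_ROOT/include"
```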

## How to add a new implementation?

- Make a copy of (e.g.) `rawtodigi_naive.h` for the new *thing* (with the name of the *thing* after the underscore)
30 changes: 30 additions & 0 deletions cuplaCheck.h
@@ -0,0 +1,30 @@
#ifndef HeterogeneousCore_CUDAUtilities_cuplaCheck_h
#define HeterogeneousCore_CUDAUtilities_cuplaCheck_h

#include <iostream>

/* Do NOT include other headers that use CUDA runtime functions or variables
 * before this include.
 * The reason for this is that cupla renames CUDA host functions and device
 * built-in variables by using macros and macro functions.
 * Do NOT include other specific headers such as `<cuda.h>` (driver functions,
 * etc.).
 */
#include <cuda_to_cupla.hpp>

inline
bool cuplaCheck_(const char* file, int line, const char* cmd, cuplaError_t result)
{
  //std::cerr << file << ", line " << line << ": " << cmd << std::endl;
  if (result == cuplaSuccess)
    return true;

  const char* message = cuplaGetErrorString(result);
  std::cerr << file << ", line " << line << ": " << message << std::endl;
  abort();
  return false;
}

#define cuplaCheck(ARG) (cuplaCheck_(__FILE__, __LINE__, #ARG, (ARG)))

#endif // HeterogeneousCore_CUDAUtilities_cuplaCheck_h
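Usage mirrors the existing `cudaCheck` helper on the CUDA path: wrap any call that returns a `cuplaError_t`, and on failure the program prints the file, line and error string and aborts. For example (the calls below are the same ones used in `main.cc` further down):

```cpp
// Aborts with "<file>, line <line>: <error string>" unless the call returns cuplaSuccess.
cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input)));
cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream));
```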
40 changes: 40 additions & 0 deletions main.cc
@@ -12,6 +12,10 @@
#include <cuda_runtime.h>
#include "rawtodigi_cuda.h"
#include "cudaCheck.h"
#elif defined DIGI_CUPLA
#include <cuda_to_cupla.hpp>
#include "rawtodigi_cupla.h"
#include "cuplaCheck.h"
#endif

namespace {
@@ -46,6 +50,9 @@ int main() {
#ifdef DIGI_CUDA
  cudaStream_t stream;
  cudaStreamCreate(&stream);
#elif defined DIGI_CUPLA
  cuplaStream_t stream;
  cuplaStreamCreate(&stream);
#endif

int totaltime = 0;
@@ -100,6 +107,37 @@ int main() {
  cudaFree(input_d);
  cudaFreeHost(output_h);
  cudaFreeHost(input_h);
#elif defined DIGI_CUPLA
  Input *input_d, *input_h;
  cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input)));
  cuplaCheck(cuplaMallocHost((void **) &input_h, sizeof(Input)));
  std::memcpy(input_h, &input, sizeof(Input));

  Output *output_d, *output_h;
  cuplaCheck(cuplaMalloc((void **) &output_d, sizeof(Output)));
  cuplaCheck(cuplaMallocHost((void **) &output_h, sizeof(Output)));
  output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d);

  auto start = std::chrono::high_resolution_clock::now();
  cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream));
  cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream));

  cupla::rawtodigi(input_d, output_d,
                   input.wordCounter,
                   true, true, false, stream);

  cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), cuplaMemcpyDeviceToHost, stream));
  cuplaCheck(cuplaStreamSynchronize(stream));
  auto stop = std::chrono::high_resolution_clock::now();

  output_h->err.set_data(output_h->err_d);
  std::memcpy(output.get(), output_h, sizeof(Output));
  output->err.set_data(output->err_d);

  cuplaFree(output_d);
  cuplaFree(input_d);
  cuplaFreeHost(output_h);
  cuplaFreeHost(input_h);
#endif

auto diff = stop - start;
@@ -113,6 +151,8 @@ int main() {

#ifdef DIGI_CUDA
  cudaStreamDestroy(stream);
#elif defined DIGI_CUPLA
  cuplaStreamDestroy(stream);
#endif

return 0;
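One detail worth calling out in the Cupla (and CUDA) path above is the handling of the embedded `GPU::SimpleVector`: because it stores a bare data pointer next to its size and capacity, every copy of the surrounding `Output` struct has to be followed by a `set_data()` call that re-points it at the right `err_d` buffer. The relevant lines, condensed from the block above:

```cpp
// Host-side header is first aimed at the device buffer, then copied to the device.
output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d);
cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream));
// ... the kernel fills output_d->err on the device ...
cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), cuplaMemcpyDeviceToHost, stream));
output_h->err.set_data(output_h->err_d);   // point the copied header at the pinned host buffer
std::memcpy(output.get(), output_h, sizeof(Output));
output->err.set_data(output->err_d);       // and finally at the plain host copy's own buffer
```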
4 changes: 2 additions & 2 deletions output.h
@@ -5,7 +5,7 @@

#include "pixelgpudetails.h"

-#ifdef DIGI_CUDA
+#if defined DIGI_CUDA || DIGI_CUPLA
#include "GPUSimpleVector.h"
#endif

@@ -20,7 +20,7 @@ struct alignas(128) Output {

#ifdef DIGI_NAIVE
std::vector<PixelErrorCompact> err;
-#elif defined DIGI_CUDA
+#elif defined DIGI_CUDA || defined DIGI_CUPLA
PixelErrorCompact err_d[pixelgpudetails::MAX_FED_WORDS];
GPU::SimpleVector<PixelErrorCompact> err;
#endif
(The diffs for the remaining two files, `rawtodigi_cupla.h` and `rawtodigi_cupla.cc`, are not shown here.)
