Merge pull request #2 from fwyzard/implement_cupla
Add rawtodigi implementation using the Cupla backend
makortel committed Apr 12, 2019
2 parents 2d45e1e + ea0d813 commit 703ec17
Showing 9 changed files with 732 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
*~
main-*
debug-*
35 changes: 34 additions & 1 deletion GPUSimpleVector.h
@@ -6,7 +6,11 @@
#include <type_traits>
#include <utility>

#if defined DIGI_CUDA
#include <cuda.h>
#elif defined DIGI_CUPLA
#include <cuda_to_cupla.hpp>
#endif

namespace GPU {
template <class T> struct SimpleVector {
@@ -51,7 +55,35 @@ template <class T> struct SimpleVector {
    return T(); //undefined behaviour
  }

-#ifdef __CUDACC__
+#ifdef DIGI_CUPLA

  template <typename T_Acc>
  ALPAKA_FN_ACC
  int push_back(T_Acc const& acc, const T &element) {
    auto previousSize = atomicAdd(&m_size, 1);
    if (previousSize < m_capacity) {
      m_data[previousSize] = element;
      return previousSize;
    } else {
      atomicSub(&m_size, 1);
      return -1;
    }
  }

  template <typename T_Acc, class... Ts>
  ALPAKA_FN_ACC
  int emplace_back(T_Acc const& acc, Ts &&... args) {
    auto previousSize = atomicAdd(&m_size, 1);
    if (previousSize < m_capacity) {
      (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
      return previousSize;
    } else {
      atomicSub(&m_size, 1);
      return -1;
    }
  }

#elif defined __CUDACC__

// thread-safe version of the vector, when used in a CUDA kernel
__device__
@@ -80,6 +112,7 @@ template <class T> struct SimpleVector {
}

#endif // __CUDACC__

inline constexpr bool empty() const { return m_size==0;}
inline constexpr bool full() const { return m_size==m_capacity;}
inline constexpr T& operator[](int i) { return m_data[i]; }
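For context on how the new overloads are meant to be called: they keep the same contract as the existing CUDA versions, reserving a slot with `atomicAdd`, rolling the size back with `atomicSub` if the capacity is exceeded, and returning either the slot index or -1. A minimal sketch of a device-side caller written as an Alpaka-style functor (the functor name and the `int` payload are purely illustrative, not part of this commit):

```cpp
#include "GPUSimpleVector.h"

struct appendPositive {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(T_Acc const& acc,
                                GPU::SimpleVector<int>* vec, int value) const {
    if (value > 0) {
      // Thread-safe append: returns the index of the stored element,
      // or -1 if the vector is already at capacity.
      int idx = vec->push_back(acc, value);
      if (idx < 0) {
        // capacity exceeded; push_back already rolled the size back
      }
    }
  }
};
```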
65 changes: 53 additions & 12 deletions Makefile
@@ -1,19 +1,60 @@
-CXXFLAGS := -O2 -std=c++17
-CXX := g++
+TARGETS = naive cuda cupla-cuda cupla-serial
 
-CUDAFLAGS := -O2 -std=c++14 --expt-relaxed-constexpr
-NVCC := nvcc
+.PHONY: default all debug clean $(TARGETS)
 
+CXX := g++-7
+CXX_FLAGS := -O2 -std=c++14 -ftemplate-depth-512
+CXX_DEBUG := -g
+
+NVCC := /usr/local/cuda-10.0/bin/nvcc -ccbin $(CXX)
+NVCC_FLAGS := -O2 -std=c++14 --expt-relaxed-constexpr
+NVCC_DEBUG := -g -lineinfo
+
+ALPAKA_BASE := $(HOME)/src/alpaka/alpaka
+CUPLA_BASE := $(HOME)/src/alpaka/cupla
+CUPLA_FLAGS := -DALPAKA_DEBUG=0 -DCUPLA_STREAM_ASYNC_ENABLED=1 -I$(ALPAKA_BASE)/include -I$(CUPLA_BASE)/include -L$(CUPLA_BASE)/lib
+
 default: naive
 
-naive:
-	$(CXX) $(CXXFLAGS) -DDIGI_NAIVE -o main-naive main.cc
+all: $(TARGETS)
+
+debug: $(TARGETS:%=debug-%)
+
+clean:
+	rm -f $(TARGETS:%=main-%) $(TARGETS:%=debug-%)
+
+# Naive CPU implementation
+naive: main-naive
+
+main-naive: main.cc rawtodigi_naive.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_NAIVE -o main-naive main.cc
+
+debug-naive: main.cc rawtodigi_naive.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_NAIVE $(CXX_DEBUG) -o debug-naive main.cc
+
+# CUDA implementation
+cuda: main-cuda
+
+main-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h
+	$(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu
+
+debug-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h
+	$(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA $(NVCC_DEBUG) -o debug-cuda main.cc rawtodigi_cuda.cu
+
+# Alpaka/cupla implementation, with CUDA backend
+cupla-cuda: main-cupla-cuda
+
+main-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda -o main-cupla-cuda main.cc rawtodigi_cupla.cc
+
+debug-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda $(NVCC_DEBUG) -o debug-cupla-cuda main.cc rawtodigi_cupla.cc
 
-debug-naive:
-	$(CXX) $(CXXFLAGS) -DDIGI_NAIVE -g -o main-naive main.cc
+# Alpaka/cupla implementation, with serial cpu backend
+cupla-serial: main-cupla-serial
 
-cuda:
-	$(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu
+main-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread -o main-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial
 
-debug-cuda:
-	$(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -g -G -o main-cuda main.cc rawtodigi_cuda.cu
+debug-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h
+	$(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread $(CXX_DEBUG) -o debug-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial
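For reference, with these targets a typical workflow might look like the following; the CUDA, Alpaka and Cupla paths are hard-coded in the variables above and may need adapting to the local installation:

```bash
make            # default target, builds main-naive
make all        # builds main-naive, main-cuda, main-cupla-cuda and main-cupla-serial
make cupla-serial
make debug      # builds the debug-* variant of every target
make clean

./main-naive
./main-cupla-serial
```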
41 changes: 37 additions & 4 deletions README.md
@@ -5,10 +5,12 @@ The purpose of this test program is to experiment with various

## Current implementations

-| Implementation | Execuable | `make` target | `#ifdef` macro |
-|----------------|--------------|---------------|----------------|
-| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` |
-| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` |
+| Implementation | Executable | `make` target | `#ifdef` macro |
+|----------------|----------------------|---------------|----------------|
+| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` |
+| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` |
+| Cupla on CPU | `main-cupla-serial` |`cupla-serial` | `DIGI_CUPLA` |
+| Cupla on GPU | `main-cupla-cuda` |`cupla-cuda` | `DIGI_CUPLA` |

### Naive CPU

@@ -20,6 +22,37 @@ The CUDA test program requires a recent CUDA version (`nvcc`
supporting C++14 and `--expt-relaxed-constexpr`) and a machine with a GPU.

### Cupla

The Cupla test program can be compiled for different backends; so far it has
been tested with the CUDA backend (`-DALPAKA_ACC_GPU_CUDA_ENABLED`) and the
serial CPU backend (`-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED`). The CUDA backend
requires CUDA 9.2 or CUDA 10.0, and has been tested with gcc 8.

Note that the Cupla libraries need to be built before they can be used:
```bash
export ALPAKA_ROOT=$HOME/src/alpaka/alpaka
export CUPLA_ROOT=$HOME/src/alpaka/cupla

git clone git@github.com:ComputationalRadiationPhysics/alpaka.git -b 0.3.5 $ALPAKA_ROOT
git clone git@github.com:ComputationalRadiationPhysics/cupla.git -b 0.1.1 $CUPLA_ROOT

mkdir -p $CUPLA_ROOT/build $CUPLA_ROOT/lib
cd $CUPLA_ROOT/build
FILES="$CUPLA_ROOT/src/*.cpp $CUPLA_ROOT/src/manager/*.cpp"

for FILE in $FILES; do
$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -x cu -c $FILE -o cuda_$(basename $FILE).o
done
$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -shared cuda_*.o -o $CUPLA_ROOT/lib/libcupla-cuda.so

for FILE in $FILES; do
$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -c $FILE -o serial_$(basename $FILE).o
done
$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -shared serial_*.o -o $CUPLA_ROOT/lib/libcupla-serial.so

```
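The snippet above relies on `$CXX`, `$CXX_FLAGS`, `$NVCC` and `$NVCC_FLAGS` being set in the environment beforehand. One possible choice, mirroring the Makefile defaults, is shown below; the exact flags, in particular `-fPIC` for building the shared libraries and the Alpaka/Cupla include paths, are an assumption here rather than part of this commit:

```bash
export CXX="g++-7"
export CXX_FLAGS="-O2 -std=c++14 -fPIC -I$ALPAKA_ROOT/include -I$CUPLA_ROOT/include"
export NVCC="/usr/local/cuda-10.0/bin/nvcc -ccbin g++-7"
export NVCC_FLAGS="-O2 -std=c++14 --expt-relaxed-constexpr -Xcompiler -fPIC -I$ALPAKA_ROOT/include -I$CUPLA_ROOT/include"
```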

## How to add a new implementation?

- Make a copy of (e.g.) `rawtodigi_naive.h` for the new *thing* (with the name of the *thing* after the underscore)
30 changes: 30 additions & 0 deletions cuplaCheck.h
@@ -0,0 +1,30 @@
#ifndef HeterogeneousCore_CUDAUtilities_cuplaCheck_h
#define HeterogeneousCore_CUDAUtilities_cuplaCheck_h

#include <iostream>

/* Do NOT include other headers that use CUDA runtime functions or variables
 * before this include.
 * The reason for this is that cupla renames CUDA host functions and device
 * built-in variables by using macros and macro functions.
 * Do NOT include other specific headers such as `<cuda.h>` (driver functions,
 * etc.).
 */
#include <cuda_to_cupla.hpp>

inline
bool cuplaCheck_(const char* file, int line, const char* cmd, cuplaError_t result)
{
  //std::cerr << file << ", line " << line << ": " << cmd << std::endl;
  if (result == cuplaSuccess)
    return true;

  const char* message = cuplaGetErrorString(result);
  std::cerr << file << ", line " << line << ": " << message << std::endl;
  abort();
  return false;
}

#define cuplaCheck(ARG) (cuplaCheck_(__FILE__, __LINE__, #ARG, (ARG)))

#endif // HeterogeneousCore_CUDAUtilities_cuplaCheck_h
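Usage mirrors the existing `cudaCheck` helper on the CUDA path: wrap any call that returns a `cuplaError_t`, and on failure the program prints the file, line and error string and aborts. For example (the calls below are the same ones used in `main.cc` further down):

```cpp
// Aborts with "<file>, line <line>: <error string>" unless the call returns cuplaSuccess.
cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input)));
cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream));
```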
40 changes: 40 additions & 0 deletions main.cc
@@ -12,6 +12,10 @@
#include <cuda_runtime.h>
#include "rawtodigi_cuda.h"
#include "cudaCheck.h"
#elif defined DIGI_CUPLA
#include <cuda_to_cupla.hpp>
#include "rawtodigi_cupla.h"
#include "cuplaCheck.h"
#endif

namespace {
@@ -46,6 +50,9 @@ int main() {
#ifdef DIGI_CUDA
  cudaStream_t stream;
  cudaStreamCreate(&stream);
#elif defined DIGI_CUPLA
  cuplaStream_t stream;
  cuplaStreamCreate(&stream);
#endif

int totaltime = 0;
@@ -100,6 +107,37 @@ int main() {
  cudaFree(input_d);
  cudaFreeHost(output_h);
  cudaFreeHost(input_h);
#elif defined DIGI_CUPLA
  Input *input_d, *input_h;
  cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input)));
  cuplaCheck(cuplaMallocHost((void **) &input_h, sizeof(Input)));
  std::memcpy(input_h, &input, sizeof(Input));

  Output *output_d, *output_h;
  cuplaCheck(cuplaMalloc((void **) &output_d, sizeof(Output)));
  cuplaCheck(cuplaMallocHost((void **) &output_h, sizeof(Output)));
  output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d);

  auto start = std::chrono::high_resolution_clock::now();
  cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream));
  cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream));

  cupla::rawtodigi(input_d, output_d,
                   input.wordCounter,
                   true, true, false, stream);

  cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), cuplaMemcpyDeviceToHost, stream));
  cuplaCheck(cuplaStreamSynchronize(stream));
  auto stop = std::chrono::high_resolution_clock::now();

  output_h->err.set_data(output_h->err_d);
  std::memcpy(output.get(), output_h, sizeof(Output));
  output->err.set_data(output->err_d);

  cuplaFree(output_d);
  cuplaFree(input_d);
  cuplaFreeHost(output_h);
  cuplaFreeHost(input_h);
#endif

auto diff = stop - start;
@@ -113,6 +151,8 @@ int main() {

#ifdef DIGI_CUDA
  cudaStreamDestroy(stream);
#elif defined DIGI_CUPLA
  cuplaStreamDestroy(stream);
#endif

return 0;
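One detail worth calling out in the Cupla (and CUDA) path above is the handling of the embedded `GPU::SimpleVector`: because it stores a bare data pointer next to its size and capacity, every copy of the surrounding `Output` struct has to be followed by a `set_data()` call that re-points it at the right `err_d` buffer. The relevant lines, condensed from the block above:

```cpp
// Host-side header is first aimed at the device buffer, then copied to the device.
output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d);
cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream));
// ... the kernel fills output_d->err on the device ...
cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), cuplaMemcpyDeviceToHost, stream));
output_h->err.set_data(output_h->err_d);   // point the copied header at the pinned host buffer
std::memcpy(output.get(), output_h, sizeof(Output));
output->err.set_data(output->err_d);       // and finally at the plain host copy's own buffer
```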
4 changes: 2 additions & 2 deletions output.h
@@ -5,7 +5,7 @@

#include "pixelgpudetails.h"

-#ifdef DIGI_CUDA
+#if defined DIGI_CUDA || DIGI_CUPLA
#include "GPUSimpleVector.h"
#endif

@@ -20,7 +20,7 @@ struct alignas(128) Output {

#ifdef DIGI_NAIVE
std::vector<PixelErrorCompact> err;
-#elif defined DIGI_CUDA
+#elif defined DIGI_CUDA || defined DIGI_CUPLA
PixelErrorCompact err_d[pixelgpudetails::MAX_FED_WORDS];
GPU::SimpleVector<PixelErrorCompact> err;
#endif
(The diffs for the remaining two files, `rawtodigi_cupla.h` and `rawtodigi_cupla.cc`, are not shown here.)
