diff --git a/GPUSimpleVector.h b/GPUSimpleVector.h index 66525d1..aa1b86c 100644 --- a/GPUSimpleVector.h +++ b/GPUSimpleVector.h @@ -6,7 +6,11 @@ #include #include +#if defined DIGI_CUDA #include +#elif defined DIGI_CUPLA +#include +#endif namespace GPU { template struct SimpleVector { @@ -51,7 +55,35 @@ template struct SimpleVector { return T(); //undefined behaviour } -#ifdef __CUDACC__ +#ifdef DIGI_CUPLA + + template + ALPAKA_FN_ACC + int push_back(T_Acc const& acc, const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + ALPAKA_FN_ACC + int emplace_back(T_Acc const& acc, Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + +#elif defined __CUDACC__ // thread-safe version of the vector, when used in a CUDA kernel __device__ @@ -80,6 +112,7 @@ template struct SimpleVector { } #endif // __CUDACC__ + inline constexpr bool empty() const { return m_size==0;} inline constexpr bool full() const { return m_size==m_capacity;} inline constexpr T& operator[](int i) { return m_data[i]; } diff --git a/Makefile b/Makefile index 7323700..965d47b 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,18 @@ -TARGETS = naive cuda +TARGETS = naive cuda cupla-cuda cupla-serial .PHONY: default all debug clean $(TARGETS) -CXXFLAGS := -O2 -std=c++17 -CXX := g++ +CXX := g++-7 +CXX_FLAGS := -O2 -std=c++14 -ftemplate-depth-512 +CXX_DEBUG := -g -CUDAFLAGS := -O2 -std=c++14 --expt-relaxed-constexpr -NVCC := nvcc +NVCC := /usr/local/cuda-10.0/bin/nvcc -ccbin $(CXX) +NVCC_FLAGS := -O2 -std=c++14 --expt-relaxed-constexpr +NVCC_DEBUG := -g -lineinfo + +ALPAKA_BASE := $(HOME)/src/alpaka/alpaka +CUPLA_BASE := $(HOME)/src/alpaka/cupla +CUPLA_FLAGS := 
-DALPAKA_DEBUG=0 -DCUPLA_STREAM_ASYNC_ENABLED=1 -I$(ALPAKA_BASE)/include -I$(CUPLA_BASE)/include -L$(CUPLA_BASE)/lib default: naive @@ -17,18 +23,38 @@ debug: $(TARGETS:%=debug-%) clean: rm -f $(TARGETS:%=main-%) $(TARGETS:%=debug-%) +# Naive CPU implementation naive: main-naive main-naive: main.cc rawtodigi_naive.h - $(CXX) $(CXXFLAGS) -DDIGI_NAIVE -o main-naive main.cc + $(CXX) $(CXX_FLAGS) -DDIGI_NAIVE -o main-naive main.cc debug-naive: main.cc rawtodigi_naive.h - $(CXX) $(CXXFLAGS) -DDIGI_NAIVE -g -o debug-naive main.cc + $(CXX) $(CXX_FLAGS) -DDIGI_NAIVE $(CXX_DEBUG) -o debug-naive main.cc +# CUDA implementation cuda: main-cuda main-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h - $(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu + $(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu debug-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h - $(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -g -lineinfo -o debug-cuda main.cc rawtodigi_cuda.cu + $(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA $(NVCC_DEBUG) -o debug-cuda main.cc rawtodigi_cuda.cu + +# Alpaka/cupla implementation, with CUDA backend +cupla-cuda: main-cupla-cuda + +main-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda -o main-cupla-cuda main.cc rawtodigi_cupla.cc + +debug-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda $(NVCC_DEBUG) -o debug-cupla-cuda main.cc rawtodigi_cupla.cc + +# Alpaka/cupla implementation, with serial cpu backend +cupla-serial: main-cupla-serial + +main-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread -o main-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial + +debug-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(CXX) 
$(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread $(CXX_DEBUG) -o debug-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial diff --git a/README.md b/README.md index 9d26150..b1a2eef 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,12 @@ The purpose of this test program is to experiment with various ## Current implementations -| Implementation | Execuable | `make` target | `#ifdef` macro | -|----------------|--------------|---------------|----------------| -| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` | -| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` | +| Implementation | Execuable | `make` target | `#ifdef` macro | +|----------------|----------------------|---------------|----------------| +| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` | +| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` | +| Cupla on CPU | `main-cupla-serial` |`cupla-serial` | `DIGI_CUPLA` | +| Cupla on GPU | `main-cupla-cuda` |`cupla-cuda` | `DIGI_CUPLA` | ### Naive CPU @@ -20,6 +22,37 @@ The CUDA test program requires a recent CUDA version (`nvcc` supporting C++14 and `--expt-relaxed-constexpr`) and a machine with GPU. +### Cupla + +The Cupla test program can be compiled for different backends; so far it has +been tested with the CUDA backend (`-DALPAKA_ACC_GPU_CUDA_ENABLED`) and the +serial CPU backend (`-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED`). The CUDA backend +requires CUDA 9.2 or CUDA 10.0, and has been tested with gcc 8. 
+ +In fact, the Cupla libraries need to be built before they can be used: +```bash +export ALPAKA_ROOT=$HOME/src/alpaka/alpaka +export CUPLA_ROOT=$HOME/src/alpaka/cupla + +git clone git@github.com:ComputationalRadiationPhysics/alpaka.git -b 0.3.5 $ALPAKA_ROOT +git clone git@github.com:ComputationalRadiationPhysics/cupla.git -b 0.1.1 $CUPLA_ROOT + +mkdir -p $CUPLA_ROOT/build $CUPLA_ROOT/lib +cd $CUPLA_ROOT/build +FILES="$CUPLA_ROOT/src/*.cpp $CUPLA_ROOT/src/manager/*.cpp" + +for FILE in $FILES; do + $NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -x cu -c $FILE -o cuda_$(basename $FILE).o +done +$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -shared cuda_*.o -o $CUPLA_ROOT/lib/libcupla-cuda.so + +for FILE in $FILES; do + $CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -c $FILE -o serial_$(basename $FILE).o +done +$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -shared serial_*.o -o $CUPLA_ROOT/lib/libcupla-serial.so + +``` + ## How to add a new implementation? - Copy of (e.g.) the `rawtodigi_naive.h` for the new *thing* (with name of the *thing* after the underscore) diff --git a/cuplaCheck.h b/cuplaCheck.h new file mode 100644 index 0000000..9f46f2c --- /dev/null +++ b/cuplaCheck.h @@ -0,0 +1,30 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuplaCheck_h +#define HeterogeneousCore_CUDAUtilities_cuplaCheck_h + +#include + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device build in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). 
+ */ +#include + +inline +bool cuplaCheck_(const char* file, int line, const char* cmd, cuplaError_t result) +{ + //std::cerr << file << ", line " << line << ": " << cmd << std::endl; + if (result == cuplaSuccess) + return true; + + const char* message = cuplaGetErrorString(result); + std::cerr << file << ", line " << line << ": " << message << std::endl; + abort(); + return false; +} + +#define cuplaCheck(ARG) (cuplaCheck_(__FILE__, __LINE__, #ARG, (ARG))) + +#endif // HeterogeneousCore_CUDAUtilities_cuplaCheck_h diff --git a/main.cc b/main.cc index 26789de..0183c1c 100644 --- a/main.cc +++ b/main.cc @@ -12,6 +12,10 @@ #include #include "rawtodigi_cuda.h" #include "cudaCheck.h" +#elif defined DIGI_CUPLA +#include +#include "rawtodigi_cupla.h" +#include "cuplaCheck.h" #endif namespace { @@ -46,6 +50,9 @@ int main() { #ifdef DIGI_CUDA cudaStream_t stream; cudaStreamCreate(&stream); +#elif defined DIGI_CUPLA + cuplaStream_t stream; + cuplaStreamCreate(&stream); #endif int totaltime = 0; @@ -100,6 +107,37 @@ int main() { cudaFree(input_d); cudaFreeHost(output_h); cudaFreeHost(input_h); +#elif defined DIGI_CUPLA + Input *input_d, *input_h; + cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input))); + cuplaCheck(cuplaMallocHost((void **) &input_h, sizeof(Input))); + std::memcpy(input_h, &input, sizeof(Input)); + + Output *output_d, *output_h; + cuplaCheck(cuplaMalloc((void **) &output_d, sizeof(Output))); + cuplaCheck(cuplaMallocHost((void **) &output_h, sizeof(Output))); + output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d); + + auto start = std::chrono::high_resolution_clock::now(); + cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream)); + cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream)); + + cupla::rawtodigi(input_d, output_d, + input.wordCounter, + true, true, false, stream); + + cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), 
cuplaMemcpyDeviceToHost, stream)); + cuplaCheck(cuplaStreamSynchronize(stream)); + auto stop = std::chrono::high_resolution_clock::now(); + + output_h->err.set_data(output_h->err_d); + std::memcpy(output.get(), output_h, sizeof(Output)); + output->err.set_data(output->err_d); + + cuplaFree(output_d); + cuplaFree(input_d); + cuplaFreeHost(output_h); + cuplaFreeHost(input_h); #endif auto diff = stop - start; @@ -113,6 +151,8 @@ int main() { #ifdef DIGI_CUDA cudaStreamDestroy(stream); +#elif defined DIGI_CUPLA + cuplaStreamDestroy(stream); #endif return 0; diff --git a/output.h b/output.h index 0e8bac9..762cb3b 100644 --- a/output.h +++ b/output.h @@ -5,7 +5,7 @@ #include "pixelgpudetails.h" -#ifdef DIGI_CUDA +#if defined DIGI_CUDA || DIGI_CUPLA #include "GPUSimpleVector.h" #endif @@ -20,7 +20,7 @@ struct alignas(128) Output { #ifdef DIGI_NAIVE std::vector err; -#elif defined DIGI_CUDA +#elif defined DIGI_CUDA || defined DIGI_CUPLA PixelErrorCompact err_d[pixelgpudetails::MAX_FED_WORDS]; GPU::SimpleVector err; #endif diff --git a/rawtodigi_cupla.cc b/rawtodigi_cupla.cc new file mode 100644 index 0000000..4ce5381 --- /dev/null +++ b/rawtodigi_cupla.cc @@ -0,0 +1,511 @@ +#include +#include + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device build in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). 
+ */ +#include + +#include "rawtodigi_cupla.h" +#include "cuplaCheck.h" + +namespace cupla { + class Packing { + public: + using PackedDigiType = uint32_t; + + // Constructor: pre-computes masks and shifts from field widths + ALPAKA_FN_HOST_ACC + inline + constexpr Packing(unsigned int row_w, unsigned int column_w, + unsigned int time_w, unsigned int adc_w) : + row_width(row_w), + column_width(column_w), + adc_width(adc_w), + row_shift(0), + column_shift(row_shift + row_w), + time_shift(column_shift + column_w), + adc_shift(time_shift + time_w), + row_mask(~(~0U << row_w)), + column_mask( ~(~0U << column_w)), + time_mask(~(~0U << time_w)), + adc_mask(~(~0U << adc_w)), + rowcol_mask(~(~0U << (column_w+row_w))), + max_row(row_mask), + max_column(column_mask), + max_adc(adc_mask) + { } + + uint32_t row_width; + uint32_t column_width; + uint32_t adc_width; + + uint32_t row_shift; + uint32_t column_shift; + uint32_t time_shift; + uint32_t adc_shift; + + PackedDigiType row_mask; + PackedDigiType column_mask; + PackedDigiType time_mask; + PackedDigiType adc_mask; + PackedDigiType rowcol_mask; + + uint32_t max_row; + uint32_t max_column; + uint32_t max_adc; + }; + + ALPAKA_FN_HOST_ACC + inline + constexpr Packing packing() { + return Packing(11, 11, 0, 10); + } + + + ALPAKA_FN_HOST_ACC + inline + uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) { + constexpr Packing thePacking = packing(); + adc = std::min(adc, thePacking.max_adc); + + return (row << thePacking.row_shift) | + (col << thePacking.column_shift) | + (adc << thePacking.adc_shift); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getLink(uint32_t ww) { + return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getRoc(uint32_t ww) { + return ((ww >> pixelgpudetails::ROC_shift ) & pixelgpudetails::ROC_mask); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getADC(uint32_t ww) { + return ((ww >> pixelgpudetails::ADC_shift) & pixelgpudetails::ADC_mask); + } + + + 
ALPAKA_FN_HOST_ACC + bool isBarrel(uint32_t rawId) { + return (1==((rawId>>25)&0x7)); + } + + ALPAKA_FN_HOST_ACC + bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) + { + constexpr uint32_t numRowsInRoc = 80; + constexpr uint32_t numColsInRoc = 52; + + /// row and collumn in ROC representation + return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); + } + + ALPAKA_FN_HOST_ACC + bool dcolIsValid(uint32_t dcol, uint32_t pxid) + { + return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); + } + + ALPAKA_FN_HOST_ACC + pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU * cablingMap, uint8_t fed, uint32_t link, uint32_t roc) { + uint32_t index = fed * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + roc; + pixelgpudetails::DetIdGPU detId = { cablingMap->RawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index] }; + return detId; + } + + ALPAKA_FN_HOST_ACC + pixelgpudetails::Pixel frameConversion(bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) { + + int slopeRow = 0, slopeCol = 0; + int rowOffset = 0, colOffset = 0; + + if (bpix) { + + if (side == -1 && layer != 1) { // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } // if roc + } + else { // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1 + if (rocIdInDetUnit < 8) { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc; + } + else { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (16-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + } + + } + else { 
// fpix + if (side==-1) { // pannel 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } + } + else { // pannel 2 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } + + } // side + + } + + uint32_t gRow = rowOffset+slopeRow*local.row; + uint32_t gCol = colOffset+slopeCol*local.col; + //printf("Inside frameConversion row: %u, column: %u\n", gRow, gCol); + pixelgpudetails::Pixel global = {gRow, gCol}; + return global; + } + + ALPAKA_FN_HOST_ACC + uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false) + { + // debug = true; + + if(debug) { + switch (status) { + case(1) : { + printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId ); + break; + } + case(2) : { + printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId); + break; + } + case(3) : { + printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId); + break; + } + case(4) : { + printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId); + break; + } + default: + if (debug) printf("Cabling check returned unexpected result, status = %i\n", status); + }; + } + + if(status >= 1 and status <= 4) { + return status + 34; + } + return 0; + } + + ALPAKA_FN_HOST_ACC + uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) + { + uint32_t rID = 0xffffffff; + + switch (errorType) { + case 25 : case 30 : case 31 : case 36 : case 40 : { + //set dummy 
values for cabling just to get detId from link + //cabling.dcol = 0; + //cabling.pxid = 2; + constexpr uint32_t roc = 1; + const uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if (rID_temp != 9999) rID = rID_temp; + break; + } + case 29 : { + int chanNmbr = 0; + constexpr int DB0_shift = 0; + constexpr int DB1_shift = DB0_shift + 1; + constexpr int DB2_shift = DB1_shift + 1; + constexpr int DB3_shift = DB2_shift + 1; + constexpr int DB4_shift = DB3_shift + 1; + constexpr uint32_t DataBit_mask = ~(~uint32_t(0) << 1); + + const int CH1 = (errWord >> DB0_shift) & DataBit_mask; + const int CH2 = (errWord >> DB1_shift) & DataBit_mask; + const int CH3 = (errWord >> DB2_shift) & DataBit_mask; + const int CH4 = (errWord >> DB3_shift) & DataBit_mask; + const int CH5 = (errWord >> DB4_shift) & DataBit_mask; + constexpr int BLOCK_bits = 3; + constexpr int BLOCK_shift = 8; + constexpr uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits); + const int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask; + const int localCH = 1*CH1+2*CH2+3*CH3+4*CH4+5*CH5; + if (BLOCK%2==0) chanNmbr=(BLOCK/2)*9+localCH; + else chanNmbr = ((BLOCK-1)/2)*9+4+localCH; + if ((chanNmbr < 1)||(chanNmbr > 36)) break; // signifies unexpected result + + // set dummy values for cabling just to get detId from link if in Barrel + //cabling.dcol = 0; + //cabling.pxid = 2; + constexpr uint32_t roc = 1; + const uint32_t link = chanNmbr; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if(rID_temp != 9999) rID = rID_temp; + break; + } + case 37 : case 38: { + //cabling.dcol = 0; + //cabling.pxid = 2; + const uint32_t roc = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask; + const uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if(rID_temp != 9999) 
rID = rID_temp; + break; + } + default: + break; + }; + + return rID; + } + + ALPAKA_FN_HOST_ACC + uint8_t checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) + { + uint8_t errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask; + if (errorType < 25) return 0; + bool errorFound = false; + + switch (errorType) { + case(25) : { + errorFound = true; + uint32_t index = fedId * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + 1; + if (index > 1 && index <= cablingMap->size) { + if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index])) errorFound = false; + } + if (debug and errorFound) printf("Invalid ROC = 25 found (errorType = 25)\n"); + break; + } + case(26) : { + if (debug) printf("Gap word found (errorType = 26)\n"); + errorFound = true; + break; + } + case(27) : { + if (debug) printf("Dummy word found (errorType = 27)\n"); + errorFound = true; + break; + } + case(28) : { + if (debug) printf("Error fifo nearly full (errorType = 28)\n"); + errorFound = true; + break; + } + case(29) : { + if (debug) printf("Timeout on a channel (errorType = 29)\n"); + if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) { + if (debug) printf("...first errorType=29 error, this gets masked out\n"); + } + errorFound = true; + break; + } + case(30) : { + if (debug) printf("TBM error trailer (errorType = 30)\n"); + int StateMatch_bits = 4; + int StateMatch_shift = 8; + uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); + int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; + if ( StateMatch != 1 && StateMatch != 8 ) { + if (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); + } + if (StateMatch == 1) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30 + errorFound = true; + break; + } + case(31) : { + if (debug) printf("Event number error 
(errorType = 31)\n"); + errorFound = true; + break; + } + default: + errorFound = false; + }; + + return errorFound ? errorType : 0; + } + + + struct rawtodigi_kernel { + + template + ALPAKA_FN_ACC + void operator()(T_Acc const& acc, const Input *input, Output *output, + bool useQualityInfo, bool includeErrors, bool debug) const + { + const SiPixelFedCablingMapGPU* cablingMap = &input->cablingMap; + const uint32_t wordCounter = input->wordCounter; + const uint32_t* word = input->word; + const uint8_t* fedIds =input->fedId; + uint16_t* xx = output->xx; + uint16_t* yy = output->yy; + uint16_t* adc = output->adc; + uint32_t* pdigi = output->digi; + uint32_t* rawIdArr = output->rawIdArr; + uint16_t* moduleId = output->moduleInd; + GPU::SimpleVector* err = &output->err; + + + int32_t first = threadIdx.x + blockIdx.x*blockDim.x; + for (int32_t iloop=first, nend=wordCounter; ilooppush_back(acc, PixelErrorCompact{rID, ww, errorType, fedId}); + continue; + } + + uint32_t rawId = detId.RawId; + uint32_t rocIdInDetUnit = detId.rocInDet; + bool barrel = isBarrel(rawId); + + uint32_t index = fedId * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + roc; + if (useQualityInfo) { + skipROC = cablingMap->badRocs[index]; + if (skipROC) continue; + } + + uint32_t layer = 0;//, ladder =0; + int side = 0, panel = 0, module = 0;//disk = 0, blade = 0 + + if (barrel) + { + layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask; + module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask; + side = (module < 5)? -1 : 1; + } + else { + // endcap ids + layer = 0; + panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; + //disk = (rawId >> diskStartBit_) & diskMask_; + side = (panel == 1)? 
-1 : 1; + //blade = (rawId >> bladeStartBit_) & bladeMask_; + } + + // ***special case of layer to 1 be handled here + pixelgpudetails::Pixel localPix; + if (layer == 1) { + uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask; + uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask; + localPix.row = row; + localPix.col = col; + if (includeErrors) { + if (not rocRowColIsValid(row, col)) { + uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if(debug) printf("BPIX1 Error status: %i\n", error); + continue; + } + } + } else { + // ***conversion rules for dcol and pxid + uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask; + uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask; + uint32_t row = pixelgpudetails::numRowsInRoc - pxid/2; + uint32_t col = dcol*2 + pxid%2; + localPix.row = row; + localPix.col = col; + if (includeErrors and not dcolIsValid(dcol, pxid)) { + uint8_t error = conversionError(fedId, 3, debug); + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); + continue; + } + } + + pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); + xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 + yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 + adc[gIndex] = getADC(ww); + pdigi[gIndex] = pack(globalPix.row, globalPix.col, adc[gIndex]); + moduleId[gIndex] = detId.moduleId; + rawIdArr[gIndex] = rawId; + } // end of loop (gIndex < end) + + } // end of Raw to Digi kernel + + }; + + void rawtodigi(const Input *input_d, Output *output_d, + const uint32_t wordCounter, + bool useQualityInfo, bool includeErrors, bool debug, cuplaStream_t stream) { + const int threadsPerBlock = 512; + const int blocks = 
(wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + CUPLA_KERNEL_OPTI(rawtodigi_kernel)(blocks, threadsPerBlock, 0, stream)(input_d, + output_d, + useQualityInfo, + includeErrors, + debug); + cuplaCheck(cuplaGetLastError()); + } + +} // end namespace cupla diff --git a/rawtodigi_cupla.h b/rawtodigi_cupla.h new file mode 100644 index 0000000..6c3f821 --- /dev/null +++ b/rawtodigi_cupla.h @@ -0,0 +1,24 @@ +#ifndef RAWTODIGI_CUPLA_H +#define RAWTODIGI_CUPLA_H + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device built-in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). + */ +#include + +#include "pixelgpudetails.h" +#include "GPUSimpleVector.h" +#include "input.h" +#include "output.h" + +namespace cupla { + void rawtodigi(const Input *input_d, Output *output_d, + const uint32_t wordCounter, + bool useQualityInfo, bool includeErrors, bool debug, cuplaStream_t stream); +} + +#endif