diff --git a/GPUSimpleVector.h b/GPUSimpleVector.h index 66525d1..aa1b86c 100644 --- a/GPUSimpleVector.h +++ b/GPUSimpleVector.h @@ -6,7 +6,11 @@ #include #include +#if defined DIGI_CUDA #include +#elif defined DIGI_CUPLA +#include +#endif namespace GPU { template struct SimpleVector { @@ -51,7 +55,35 @@ template struct SimpleVector { return T(); //undefined behaviour } -#ifdef __CUDACC__ +#ifdef DIGI_CUPLA + + template + ALPAKA_FN_ACC + int push_back(T_Acc const& acc, const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + ALPAKA_FN_ACC + int emplace_back(T_Acc const& acc, Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + +#elif defined __CUDACC__ // thread-safe version of the vector, when used in a CUDA kernel __device__ @@ -80,6 +112,7 @@ template struct SimpleVector { } #endif // __CUDACC__ + inline constexpr bool empty() const { return m_size==0;} inline constexpr bool full() const { return m_size==m_capacity;} inline constexpr T& operator[](int i) { return m_data[i]; } diff --git a/Makefile b/Makefile index 7323700..965d47b 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,18 @@ -TARGETS = naive cuda +TARGETS = naive cuda cupla-cuda cupla-serial .PHONY: default all debug clean $(TARGETS) -CXXFLAGS := -O2 -std=c++17 -CXX := g++ +CXX := g++-7 +CXX_FLAGS := -O2 -std=c++14 -ftemplate-depth-512 +CXX_DEBUG := -g -CUDAFLAGS := -O2 -std=c++14 --expt-relaxed-constexpr -NVCC := nvcc +NVCC := /usr/local/cuda-10.0/bin/nvcc -ccbin $(CXX) +NVCC_FLAGS := -O2 -std=c++14 --expt-relaxed-constexpr +NVCC_DEBUG := -g -lineinfo + +ALPAKA_BASE := $(HOME)/src/alpaka/alpaka +CUPLA_BASE := $(HOME)/src/alpaka/cupla +CUPLA_FLAGS := 
-DALPAKA_DEBUG=0 -DCUPLA_STREAM_ASYNC_ENABLED=1 -I$(ALPAKA_BASE)/include -I$(CUPLA_BASE)/include -L$(CUPLA_BASE)/lib default: naive @@ -17,18 +23,38 @@ debug: $(TARGETS:%=debug-%) clean: rm -f $(TARGETS:%=main-%) $(TARGETS:%=debug-%) +# Naive CPU implementation naive: main-naive main-naive: main.cc rawtodigi_naive.h - $(CXX) $(CXXFLAGS) -DDIGI_NAIVE -o main-naive main.cc + $(CXX) $(CXX_FLAGS) -DDIGI_NAIVE -o main-naive main.cc debug-naive: main.cc rawtodigi_naive.h - $(CXX) $(CXXFLAGS) -DDIGI_NAIVE -g -o debug-naive main.cc + $(CXX) $(CXX_FLAGS) -DDIGI_NAIVE $(CXX_DEBUG) -o debug-naive main.cc +# CUDA implementation cuda: main-cuda main-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h - $(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu + $(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA -o main-cuda main.cc rawtodigi_cuda.cu debug-cuda: main.cc rawtodigi_cuda.cu rawtodigi_cuda.h - $(NVCC) $(CUDAFLAGS) -DDIGI_CUDA -g -lineinfo -o debug-cuda main.cc rawtodigi_cuda.cu + $(NVCC) $(NVCC_FLAGS) -DDIGI_CUDA $(NVCC_DEBUG) -o debug-cuda main.cc rawtodigi_cuda.cu + +# Alpaka/cupla implementation, with CUDA backend +cupla-cuda: main-cupla-cuda + +main-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda -o main-cupla-cuda main.cc rawtodigi_cupla.cc + +debug-cupla-cuda: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(NVCC) -x cu -w $(NVCC_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_GPU_CUDA_ENABLED $(CUPLA_FLAGS) -lcupla-cuda $(NVCC_DEBUG) -o debug-cupla-cuda main.cc rawtodigi_cupla.cc + +# Alpaka/cupla implementation, with serial cpu backend +cupla-serial: main-cupla-serial + +main-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(CXX) $(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread -o main-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial + +debug-cupla-serial: main.cc rawtodigi_cupla.cc rawtodigi_cupla.h + $(CXX) 
$(CXX_FLAGS) -DDIGI_CUPLA -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $(CUPLA_FLAGS) -pthread $(CXX_DEBUG) -o debug-cupla-serial main.cc rawtodigi_cupla.cc -lcupla-serial diff --git a/README.md b/README.md index 9d26150..b1a2eef 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,12 @@ The purpose of this test program is to experiment with various ## Current implementations -| Implementation | Execuable | `make` target | `#ifdef` macro | -|----------------|--------------|---------------|----------------| -| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` | -| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` | +| Implementation | Execuable | `make` target | `#ifdef` macro | +|----------------|----------------------|---------------|----------------| +| Naive CPU | `main-naive` |`naive` | `DIGI_NAIVE` | +| CUDA | `main-cuda` |`cuda` | `DIGI_CUDA` | +| Cupla on CPU | `main-cupla-serial` |`cupla-serial` | `DIGI_CUPLA` | +| Cupla on GPU | `main-cupla-cuda` |`cupla-cuda` | `DIGI_CUPLA` | ### Naive CPU @@ -20,6 +22,37 @@ The CUDA test program requires a recent CUDA version (`nvcc` supporting C++14 and `--expt-relaxed-constexpr`) and a machine with GPU. +### Cupla + +The Cupla test program can be compiled for different backends; so far it has +been tested with the CUDA backend (`-DALPAKA_ACC_GPU_CUDA_ENABLED`) and the +serial CPU backend (`-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED`). The CUDA backend +requires CUDA 9.2 or CUDA 10.0, and has been tested with gcc 8. 
+ +In fact, the Cupla libraries need to be built before they can be used: +```bash +export ALPAKA_ROOT=$HOME/src/alpaka/alpaka +export CUPLA_ROOT=$HOME/src/alpaka/cupla + +git clone git@github.com:ComputationalRadiationPhysics/alpaka.git -b 0.3.5 $ALPAKA_ROOT +git clone git@github.com:ComputationalRadiationPhysics/cupla.git -b 0.1.1 $CUPLA_ROOT + +mkdir -p $CUPLA_ROOT/build $CUPLA_ROOT/lib +cd $CUPLA_ROOT/build +FILES="$CUPLA_ROOT/src/*.cpp $CUPLA_ROOT/src/manager/*.cpp" + +for FILE in $FILES; do + $NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -x cu -c $FILE -o cuda_$(basename $FILE).o +done +$NVCC -DALPAKA_ACC_GPU_CUDA_ENABLED $NVCC_FLAGS -shared cuda_*.o -o $CUPLA_ROOT/lib/libcupla-cuda.so + +for FILE in $FILES; do + $CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -c $FILE -o serial_$(basename $FILE).o +done +$CXX -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED $CXX_FLAGS -shared serial_*.o -o $CUPLA_ROOT/lib/libcupla-serial.so + +``` + ## How to add a new implementation? - Copy of (e.g.) the `rawtodigi_naive.h` for the new *thing* (with name of the *thing* after the underscore) diff --git a/cuplaCheck.h b/cuplaCheck.h new file mode 100644 index 0000000..9f46f2c --- /dev/null +++ b/cuplaCheck.h @@ -0,0 +1,30 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuplaCheck_h +#define HeterogeneousCore_CUDAUtilities_cuplaCheck_h + +#include + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device build in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). 
+ */ +#include + +inline +bool cuplaCheck_(const char* file, int line, const char* cmd, cuplaError_t result) +{ + //std::cerr << file << ", line " << line << ": " << cmd << std::endl; + if (result == cuplaSuccess) + return true; + + const char* message = cuplaGetErrorString(result); + std::cerr << file << ", line " << line << ": " << message << std::endl; + abort(); + return false; +} + +#define cuplaCheck(ARG) (cuplaCheck_(__FILE__, __LINE__, #ARG, (ARG))) + +#endif // HeterogeneousCore_CUDAUtilities_cuplaCheck_h diff --git a/main.cc b/main.cc index 26789de..0183c1c 100644 --- a/main.cc +++ b/main.cc @@ -12,6 +12,10 @@ #include #include "rawtodigi_cuda.h" #include "cudaCheck.h" +#elif defined DIGI_CUPLA +#include +#include "rawtodigi_cupla.h" +#include "cuplaCheck.h" #endif namespace { @@ -46,6 +50,9 @@ int main() { #ifdef DIGI_CUDA cudaStream_t stream; cudaStreamCreate(&stream); +#elif defined DIGI_CUPLA + cuplaStream_t stream; + cuplaStreamCreate(&stream); #endif int totaltime = 0; @@ -100,6 +107,37 @@ int main() { cudaFree(input_d); cudaFreeHost(output_h); cudaFreeHost(input_h); +#elif defined DIGI_CUPLA + Input *input_d, *input_h; + cuplaCheck(cuplaMalloc((void **) &input_d, sizeof(Input))); + cuplaCheck(cuplaMallocHost((void **) &input_h, sizeof(Input))); + std::memcpy(input_h, &input, sizeof(Input)); + + Output *output_d, *output_h; + cuplaCheck(cuplaMalloc((void **) &output_d, sizeof(Output))); + cuplaCheck(cuplaMallocHost((void **) &output_h, sizeof(Output))); + output_h->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d->err_d); + + auto start = std::chrono::high_resolution_clock::now(); + cuplaCheck(cuplaMemcpyAsync(input_d, input_h, sizeof(Input), cuplaMemcpyHostToDevice, stream)); + cuplaCheck(cuplaMemcpyAsync(output_d, output_h, sizeof(Output), cuplaMemcpyHostToDevice, stream)); + + cupla::rawtodigi(input_d, output_d, + input.wordCounter, + true, true, false, stream); + + cuplaCheck(cuplaMemcpyAsync(output_h, output_d, sizeof(Output), 
cuplaMemcpyDeviceToHost, stream)); + cuplaCheck(cuplaStreamSynchronize(stream)); + auto stop = std::chrono::high_resolution_clock::now(); + + output_h->err.set_data(output_h->err_d); + std::memcpy(output.get(), output_h, sizeof(Output)); + output->err.set_data(output->err_d); + + cuplaFree(output_d); + cuplaFree(input_d); + cuplaFreeHost(output_h); + cuplaFreeHost(input_h); #endif auto diff = stop - start; @@ -113,6 +151,8 @@ int main() { #ifdef DIGI_CUDA cudaStreamDestroy(stream); +#elif defined DIGI_CUPLA + cuplaStreamDestroy(stream); #endif return 0; diff --git a/output.h b/output.h index 0e8bac9..762cb3b 100644 --- a/output.h +++ b/output.h @@ -5,7 +5,7 @@ #include "pixelgpudetails.h" -#ifdef DIGI_CUDA +#if defined DIGI_CUDA || DIGI_CUPLA #include "GPUSimpleVector.h" #endif @@ -20,7 +20,7 @@ struct alignas(128) Output { #ifdef DIGI_NAIVE std::vector err; -#elif defined DIGI_CUDA +#elif defined DIGI_CUDA || defined DIGI_CUPLA PixelErrorCompact err_d[pixelgpudetails::MAX_FED_WORDS]; GPU::SimpleVector err; #endif diff --git a/rawtodigi_cupla.cc b/rawtodigi_cupla.cc new file mode 100644 index 0000000..4ce5381 --- /dev/null +++ b/rawtodigi_cupla.cc @@ -0,0 +1,511 @@ +#include +#include + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device build in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). 
+ */ +#include + +#include "rawtodigi_cupla.h" +#include "cuplaCheck.h" + +namespace cupla { + class Packing { + public: + using PackedDigiType = uint32_t; + + // Constructor: pre-computes masks and shifts from field widths + ALPAKA_FN_HOST_ACC + inline + constexpr Packing(unsigned int row_w, unsigned int column_w, + unsigned int time_w, unsigned int adc_w) : + row_width(row_w), + column_width(column_w), + adc_width(adc_w), + row_shift(0), + column_shift(row_shift + row_w), + time_shift(column_shift + column_w), + adc_shift(time_shift + time_w), + row_mask(~(~0U << row_w)), + column_mask( ~(~0U << column_w)), + time_mask(~(~0U << time_w)), + adc_mask(~(~0U << adc_w)), + rowcol_mask(~(~0U << (column_w+row_w))), + max_row(row_mask), + max_column(column_mask), + max_adc(adc_mask) + { } + + uint32_t row_width; + uint32_t column_width; + uint32_t adc_width; + + uint32_t row_shift; + uint32_t column_shift; + uint32_t time_shift; + uint32_t adc_shift; + + PackedDigiType row_mask; + PackedDigiType column_mask; + PackedDigiType time_mask; + PackedDigiType adc_mask; + PackedDigiType rowcol_mask; + + uint32_t max_row; + uint32_t max_column; + uint32_t max_adc; + }; + + ALPAKA_FN_HOST_ACC + inline + constexpr Packing packing() { + return Packing(11, 11, 0, 10); + } + + + ALPAKA_FN_HOST_ACC + inline + uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) { + constexpr Packing thePacking = packing(); + adc = std::min(adc, thePacking.max_adc); + + return (row << thePacking.row_shift) | + (col << thePacking.column_shift) | + (adc << thePacking.adc_shift); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getLink(uint32_t ww) { + return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getRoc(uint32_t ww) { + return ((ww >> pixelgpudetails::ROC_shift ) & pixelgpudetails::ROC_mask); + } + + + ALPAKA_FN_HOST_ACC + uint32_t getADC(uint32_t ww) { + return ((ww >> pixelgpudetails::ADC_shift) & pixelgpudetails::ADC_mask); + } + + + 
ALPAKA_FN_HOST_ACC + bool isBarrel(uint32_t rawId) { + return (1==((rawId>>25)&0x7)); + } + + ALPAKA_FN_HOST_ACC + bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) + { + constexpr uint32_t numRowsInRoc = 80; + constexpr uint32_t numColsInRoc = 52; + + /// row and collumn in ROC representation + return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); + } + + ALPAKA_FN_HOST_ACC + bool dcolIsValid(uint32_t dcol, uint32_t pxid) + { + return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); + } + + ALPAKA_FN_HOST_ACC + pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU * cablingMap, uint8_t fed, uint32_t link, uint32_t roc) { + uint32_t index = fed * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + roc; + pixelgpudetails::DetIdGPU detId = { cablingMap->RawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index] }; + return detId; + } + + ALPAKA_FN_HOST_ACC + pixelgpudetails::Pixel frameConversion(bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) { + + int slopeRow = 0, slopeCol = 0; + int rowOffset = 0, colOffset = 0; + + if (bpix) { + + if (side == -1 && layer != 1) { // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } // if roc + } + else { // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1 + if (rocIdInDetUnit < 8) { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc; + } + else { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (16-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + } + + } + else { 
// fpix + if (side==-1) { // pannel 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } + } + else { // pannel 2 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1; + } + else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2*pixelgpudetails::numRowsInRoc-1; + colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc; + } + + } // side + + } + + uint32_t gRow = rowOffset+slopeRow*local.row; + uint32_t gCol = colOffset+slopeCol*local.col; + //printf("Inside frameConversion row: %u, column: %u\n", gRow, gCol); + pixelgpudetails::Pixel global = {gRow, gCol}; + return global; + } + + ALPAKA_FN_HOST_ACC + uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false) + { + // debug = true; + + if(debug) { + switch (status) { + case(1) : { + printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId ); + break; + } + case(2) : { + printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId); + break; + } + case(3) : { + printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId); + break; + } + case(4) : { + printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId); + break; + } + default: + if (debug) printf("Cabling check returned unexpected result, status = %i\n", status); + }; + } + + if(status >= 1 and status <= 4) { + return status + 34; + } + return 0; + } + + ALPAKA_FN_HOST_ACC + uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) + { + uint32_t rID = 0xffffffff; + + switch (errorType) { + case 25 : case 30 : case 31 : case 36 : case 40 : { + //set dummy 
values for cabling just to get detId from link + //cabling.dcol = 0; + //cabling.pxid = 2; + constexpr uint32_t roc = 1; + const uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if (rID_temp != 9999) rID = rID_temp; + break; + } + case 29 : { + int chanNmbr = 0; + constexpr int DB0_shift = 0; + constexpr int DB1_shift = DB0_shift + 1; + constexpr int DB2_shift = DB1_shift + 1; + constexpr int DB3_shift = DB2_shift + 1; + constexpr int DB4_shift = DB3_shift + 1; + constexpr uint32_t DataBit_mask = ~(~uint32_t(0) << 1); + + const int CH1 = (errWord >> DB0_shift) & DataBit_mask; + const int CH2 = (errWord >> DB1_shift) & DataBit_mask; + const int CH3 = (errWord >> DB2_shift) & DataBit_mask; + const int CH4 = (errWord >> DB3_shift) & DataBit_mask; + const int CH5 = (errWord >> DB4_shift) & DataBit_mask; + constexpr int BLOCK_bits = 3; + constexpr int BLOCK_shift = 8; + constexpr uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits); + const int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask; + const int localCH = 1*CH1+2*CH2+3*CH3+4*CH4+5*CH5; + if (BLOCK%2==0) chanNmbr=(BLOCK/2)*9+localCH; + else chanNmbr = ((BLOCK-1)/2)*9+4+localCH; + if ((chanNmbr < 1)||(chanNmbr > 36)) break; // signifies unexpected result + + // set dummy values for cabling just to get detId from link if in Barrel + //cabling.dcol = 0; + //cabling.pxid = 2; + constexpr uint32_t roc = 1; + const uint32_t link = chanNmbr; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if(rID_temp != 9999) rID = rID_temp; + break; + } + case 37 : case 38: { + //cabling.dcol = 0; + //cabling.pxid = 2; + const uint32_t roc = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask; + const uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + const uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if(rID_temp != 9999) 
rID = rID_temp; + break; + } + default: + break; + }; + + return rID; + } + + ALPAKA_FN_HOST_ACC + uint8_t checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) + { + uint8_t errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask; + if (errorType < 25) return 0; + bool errorFound = false; + + switch (errorType) { + case(25) : { + errorFound = true; + uint32_t index = fedId * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + 1; + if (index > 1 && index <= cablingMap->size) { + if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index])) errorFound = false; + } + if (debug and errorFound) printf("Invalid ROC = 25 found (errorType = 25)\n"); + break; + } + case(26) : { + if (debug) printf("Gap word found (errorType = 26)\n"); + errorFound = true; + break; + } + case(27) : { + if (debug) printf("Dummy word found (errorType = 27)\n"); + errorFound = true; + break; + } + case(28) : { + if (debug) printf("Error fifo nearly full (errorType = 28)\n"); + errorFound = true; + break; + } + case(29) : { + if (debug) printf("Timeout on a channel (errorType = 29)\n"); + if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) { + if (debug) printf("...first errorType=29 error, this gets masked out\n"); + } + errorFound = true; + break; + } + case(30) : { + if (debug) printf("TBM error trailer (errorType = 30)\n"); + int StateMatch_bits = 4; + int StateMatch_shift = 8; + uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); + int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; + if ( StateMatch != 1 && StateMatch != 8 ) { + if (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); + } + if (StateMatch == 1) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30 + errorFound = true; + break; + } + case(31) : { + if (debug) printf("Event number error 
(errorType = 31)\n"); + errorFound = true; + break; + } + default: + errorFound = false; + }; + + return errorFound ? errorType : 0; + } + + + struct rawtodigi_kernel { + + template + ALPAKA_FN_ACC + void operator()(T_Acc const& acc, const Input *input, Output *output, + bool useQualityInfo, bool includeErrors, bool debug) const + { + const SiPixelFedCablingMapGPU* cablingMap = &input->cablingMap; + const uint32_t wordCounter = input->wordCounter; + const uint32_t* word = input->word; + const uint8_t* fedIds =input->fedId; + uint16_t* xx = output->xx; + uint16_t* yy = output->yy; + uint16_t* adc = output->adc; + uint32_t* pdigi = output->digi; + uint32_t* rawIdArr = output->rawIdArr; + uint16_t* moduleId = output->moduleInd; + GPU::SimpleVector* err = &output->err; + + + int32_t first = threadIdx.x + blockIdx.x*blockDim.x; + for (int32_t iloop=first, nend=wordCounter; ilooppush_back(acc, PixelErrorCompact{rID, ww, errorType, fedId}); + continue; + } + + uint32_t rawId = detId.RawId; + uint32_t rocIdInDetUnit = detId.rocInDet; + bool barrel = isBarrel(rawId); + + uint32_t index = fedId * pixelgpudetails::MAX_LINK * pixelgpudetails::MAX_ROC + (link-1) * pixelgpudetails::MAX_ROC + roc; + if (useQualityInfo) { + skipROC = cablingMap->badRocs[index]; + if (skipROC) continue; + } + + uint32_t layer = 0;//, ladder =0; + int side = 0, panel = 0, module = 0;//disk = 0, blade = 0 + + if (barrel) + { + layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask; + module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask; + side = (module < 5)? -1 : 1; + } + else { + // endcap ids + layer = 0; + panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; + //disk = (rawId >> diskStartBit_) & diskMask_; + side = (panel == 1)? 
-1 : 1; + //blade = (rawId >> bladeStartBit_) & bladeMask_; + } + + // ***special case of layer to 1 be handled here + pixelgpudetails::Pixel localPix; + if (layer == 1) { + uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask; + uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask; + localPix.row = row; + localPix.col = col; + if (includeErrors) { + if (not rocRowColIsValid(row, col)) { + uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if(debug) printf("BPIX1 Error status: %i\n", error); + continue; + } + } + } else { + // ***conversion rules for dcol and pxid + uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask; + uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask; + uint32_t row = pixelgpudetails::numRowsInRoc - pxid/2; + uint32_t col = dcol*2 + pxid%2; + localPix.row = row; + localPix.col = col; + if (includeErrors and not dcolIsValid(dcol, pxid)) { + uint8_t error = conversionError(fedId, 3, debug); + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); + continue; + } + } + + pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); + xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 + yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 + adc[gIndex] = getADC(ww); + pdigi[gIndex] = pack(globalPix.row, globalPix.col, adc[gIndex]); + moduleId[gIndex] = detId.moduleId; + rawIdArr[gIndex] = rawId; + } // end of loop (gIndex < end) + + } // end of Raw to Digi kernel + + }; + + void rawtodigi(const Input *input_d, Output *output_d, + const uint32_t wordCounter, + bool useQualityInfo, bool includeErrors, bool debug, cuplaStream_t stream) { + const int threadsPerBlock = 512; + const int blocks = 
(wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + CUPLA_KERNEL_OPTI(rawtodigi_kernel)(blocks, threadsPerBlock, 0, stream)(input_d, + output_d, + useQualityInfo, + includeErrors, + debug); + cuplaCheck(cuplaGetLastError()); + } + +} // end namespace cupla diff --git a/rawtodigi_cupla.h b/rawtodigi_cupla.h new file mode 100644 index 0000000..6c3f821 --- /dev/null +++ b/rawtodigi_cupla.h @@ -0,0 +1,24 @@ +#ifndef RAWTODIGI_CUPLA_H +#define RAWTODIGI_CUPLA_H + +/* Do NOT include other headers that use CUDA runtime functions or variables + * (see above) before this include. + * The reason for this is that cupla renames CUDA host functions and device built-in + * variables by using macros and macro functions. + * Do NOT include other specific includes such as `` (driver functions, + * etc.). + */ +#include + +#include "pixelgpudetails.h" +#include "GPUSimpleVector.h" +#include "input.h" +#include "output.h" + +namespace cupla { + void rawtodigi(const Input *input_d, Output *output_d, + const uint32_t wordCounter, + bool useQualityInfo, bool includeErrors, bool debug, cuplaStream_t stream); +} + +#endif