diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml
index 19024b9bc2b1a..75f3d15738429 100644
--- a/CUDADataFormats/BeamSpot/BuildFile.xml
+++ b/CUDADataFormats/BeamSpot/BuildFile.xml
@@ -1,8 +1,7 @@
+
-
-
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 9e44bfdf35969..907b7647a3452 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -1,11 +1,12 @@
#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
#define CUDADataFormatsCommonHeterogeneousSoA_H
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
// a heterogeneous unique pointer...
template
diff --git a/HeterogeneousCore/CUDACore/interface/GPUCuda.h b/HeterogeneousCore/CUDACore/interface/GPUCuda.h
deleted file mode 100644
index a09fb4bd4f60f..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/GPUCuda.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef HeterogeneousCore_CUDAServices_GPUCuda_h
-#define HeterogeneousCore_CUDAServices_GPUCuda_h
-
-#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "HeterogeneousCore/Producer/interface/DeviceWrapper.h"
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include
-
-#include
-
-namespace heterogeneous {
- class GPUCuda {
- public:
- using CallbackType = std::function;
-
- explicit GPUCuda(const edm::ParameterSet& iConfig);
- virtual ~GPUCuda() noexcept(false);
-
- void call_beginStreamGPUCuda(edm::StreamID id);
- bool call_acquireGPUCuda(DeviceBitSet inputLocation,
- edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder);
- void call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup);
-
- static void fillPSetDescription(edm::ParameterSetDescription& desc);
-
- private:
- virtual void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream){};
- virtual void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) = 0;
- virtual void produceGPUCuda(edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) = 0;
-
- std::unique_ptr> cudaStream_;
- int deviceId_ = -1; // device assigned to this edm::Stream
- bool enabled_;
- const bool forced_;
- };
- DEFINE_DEVICE_WRAPPER(GPUCuda, HeterogeneousDevice::kGPUCuda);
-} // namespace heterogeneous
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
deleted file mode 100644
index 706b9e6f9a86c..0000000000000
--- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-#include "chooseCUDADevice.h"
-#include
-
-#include
-
-namespace heterogeneous {
- GPUCuda::GPUCuda(const edm::ParameterSet& iConfig)
- : enabled_(iConfig.getUntrackedParameter("GPUCuda")),
- forced_(iConfig.getUntrackedParameter("force") == "GPUCuda") {
- if (forced_ && !enabled_) {
- throw cms::Exception("Configuration")
- << "It makes no sense to force the module on GPUCuda, and then disable GPUCuda.";
- }
- }
-
- GPUCuda::~GPUCuda() noexcept(false) {}
-
- void GPUCuda::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.addUntracked("GPUCuda", true); }
-
- void GPUCuda::call_beginStreamGPUCuda(edm::StreamID id) {
- edm::Service cudaService;
- enabled_ = (enabled_ && cudaService->enabled());
- if (!enabled_) {
- if (forced_) {
- throw cms::Exception("LogicError")
- << "This module was forced to run on GPUCuda, but the device is not available.";
- }
- return;
- }
-
- // For startes we "statically" assign the device based on
- // edm::Stream number. This is suboptimal if the number of
- // edm::Streams is not a multiple of the number of CUDA devices
- // (and even then there is no load balancing).
- //
- // TODO: improve. Possible ideas include
- // - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
- // * the first module of a chain dictates the device for the rest of the chain
- // - our own CUDA memory allocator
- // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
- // * would probably still need some buffer space/device to hold e.g. conditions data
- // - for conditions, how to handle multiple lumis per job?
- deviceId_ = id % cudaService->numberOfDevices();
-
- cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
-
- // Create the CUDA stream for this module-edm::Stream pair
- auto current_device = cuda::device::current::get();
- cudaStream_ = std::make_unique>(
- current_device.create_stream(cuda::stream::no_implicit_synchronization_with_default_stream));
-
- beginStreamGPUCuda(id, *cudaStream_);
- }
-
- bool GPUCuda::call_acquireGPUCuda(DeviceBitSet inputLocation,
- edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- if (!enabled_) {
- return false;
- }
-
- // TODO: currently 'forced_ == true' is already assumed. When the
- // scheduling logic evolves, add explicit treatment of forced_.
-
- cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
-
- try {
- iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, 0));
- acquireGPUCuda(iEvent, iSetup, *cudaStream_);
- cudaStream_->enqueue.callback(
- [deviceId = deviceId_,
- waitingTaskHolder, // copy needed for the catch block
- locationSetter = iEvent.locationSetter()](cuda::stream::id_t streamId, cuda::status_t status) mutable {
- if (status == cudaSuccess) {
- locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId));
- LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream "
- << streamId;
- waitingTaskHolder.doneWaiting(nullptr);
- } else {
- // wrap the exception in a try-catch block to let GDB "catch throw" break on it
- try {
- auto error = cudaGetErrorName(status);
- auto message = cudaGetErrorString(status);
- throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId
- << " error " << error << ": " << message;
- } catch (cms::Exception&) {
- waitingTaskHolder.doneWaiting(std::current_exception());
- }
- }
- });
- } catch (...) {
- waitingTaskHolder.doneWaiting(std::current_exception());
- }
- return true;
- }
-
- void GPUCuda::call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
- // I guess we have to assume that produce() may be called from a different thread than acquire() was run
- // The current CUDA device is a thread-local property, so have to set it here
- cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
-
- produceGPUCuda(iEvent, iSetup, *cudaStream_);
- }
-} // namespace heterogeneous
diff --git a/HeterogeneousCore/Producer/BuildFile.xml b/HeterogeneousCore/Producer/BuildFile.xml
deleted file mode 100644
index b75ff104fc2d9..0000000000000
--- a/HeterogeneousCore/Producer/BuildFile.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/HeterogeneousCore/Producer/README.md b/HeterogeneousCore/Producer/README.md
deleted file mode 100644
index 595dec70e1006..0000000000000
--- a/HeterogeneousCore/Producer/README.md
+++ /dev/null
@@ -1,468 +0,0 @@
-# Prototype for CMSSW interface to heterogenous algorithms
-
-## Introduction
-
-This package contains a prtotype for the CMSSW interface to
-heterogeneous algorithms. The current implementation is, in a sense, a
-mini-framework between the CMSSW core framework and the heterogeneous
-algorithms.
-
-More details can be found from the sub-package specific README files (when they get added).
-
-## Sub-packages
-
-* [`CUDACore`](../CUDACore) CUDA-specific core components
- - *TODO:* Do we actually need this separate from `CUDAServices`? Which one to keep?
-* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
-* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
-* [`Producer`](#heterogeneousedproducer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices
-* [`Product`](../Product) Core of the mini-framework for data products
-
-## Design goals
-
-1. Same module configuration should work on all machines (whether they have heterogeneous devices or not)
-2. Keep track of where data is located
-3. Run algorithms on the device where their input is located if possible
-4. Transfer temporary/transient data to CPU only if needed
-5. Abstract away repeated boilerplate code
-
-## Design considerations
-
-All below is assuming we do not touch the core CMSSW framework (that
-is left for a later exercise when we know better what exactly we want
-to do).
-
-1. The device-specific algorithms must be implemented and scheduled to the device(s) within a single EDProducer
-2. Need a special product keeping track of the data location
-3. Information of all the input heterogeneous devices must be propagated to the point deciding the device to run the algorithm
-4. The special product e.g. holds functions to do the transfer that are called if/when needed
-
-## General TODO items
-
-There are also many, more specific TODO items mentioned in comments
-within the code. The items below are more general topics (in no
-particular order).
-
-* Improve algorithm-to-device scheduling
- - Currently if an algoritm has a GPU implementation and system has a
- GPU, the algorithm is always scheduled to the GPU
- * This may lead to under-utilization of the CPU if most of the
- computations are offloaded to the GPU
- - An essential question for making this scheduling more dynamic is
- what exactly (we want) it (to) means that a "GPU is too busy" so
- it is better to run the algorithm on a CPU
- - Possible ideas to explore
- * Check the CUDA device utilization (see also monitoring point below)
- - Past/current state does not guarantee much about the near future
- * Use "tokens" as a resource allocation mechanism
- - How many tokens per device?
- - What if there no free tokens now but one becomes available after 1 ms?
- * In acquire, if GPU is "busy", put the EDProducer to a queue of
- heterogeneous tasks. When GPU "becomes available", pick an
- EDProducer from the queue and run it in GPU. If CPU runs out of
- job, pick an EDProducer from the queue and run it in CPU.
- - How to define "busy" and "becomes available"?
- - How to become aware that CPU runs out of job?
- * Can we create a TBB task that is executed only if there is nothing else to do?
- - How does this interact with possible other devices? E.g. if an algorithm has implementations for CPU, GPU, and FPGA?
-* Improve edm::Stream-to-CUDA-device scheduling
- - Currently each edm::Stream is assigned "statically" to each CUDA device in a round-robin fastion
- * There is no load balancing so throughput and utilization will not be optimal
- - The reasons for bothering with this is that the `cudaMalloc` is a
- heavy operation (see next point) not to be called for each event.
- Instead we preallocate the buffers the algorithms need at the
- initialization time. In the presence of multiple devices this
- pre-allocation leads to a need to somehow match the edm::Streams
- and devices.
- - Possible ideas to explore
- * Naively we could allocate a buffer per edm::Stream in each CUDA device
- - Amount of allocated memory is rather excessive
- * For N edm::Streams and M GPUs, allocate int((N+1)/M) buffers on each device, eventually pick the least utilized GPU
- - No unnecessary buffers
- - Need a global list/queue of these buffers per module
- * Can the list be abstracted? If not, this solution scales poorly with modules
- * Our own CUDA memory allocator that provides a fast way to allocate scratch buffers
- - Allows allocating the buffers on-demand on the "best-suited" device
-* Our own CUDA memory allocator
- - A `cudaMalloc` is a global synchronization point and takes time,
- so we want to minimize their calls. This is the main reason to
- assign edm::Streams to CUDA devices (see previous point).
- - Well-performing allocators are typically highly non-trivial to construct
-* Conditions data on GPU
- - Currently each module takes care of formatting, transferring, and updating the conditions data to GPU
- - This is probably good-enough for the current prototyping phase, but what about longer term?
- * How to deal with multiple devices, multiple edm::Streams, and multiple lumi sections in flight?
- * Do we need to make EventSetup aware of the devices? How much do the details depend on device type?
-* Add possibility to initiate the GPU->CPU transfer before the CPU product is needed
- - This would enable overlapping the GPU->CPU transfer while CPU is busy
- with other work, so the CPU product requestor would not have to wait
-* Improve configurability
- - E.g. for preferred device order?
-* Add fault tolerance
- - E.g. in a case of a GPU running out of memory continue with CPU
- - Should be configurable
-* Add support for multiple heterogeneous inputs for a module
- - Currently the device scheduling is based only on the "first input"
- - Clearly this is inadequate in general and needs to be improved
- - Any better suggestions than taking `and` of all locations?
-* Improve resource monitoring
- - E.g. for CUDA device utilization
- * https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g540824faa6cef45500e0d1dc2f50b321
- * https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t
- - Include in `CUDAService` or add a separete monitoring service?
-* Add support for a mode similar to `tbb::streaming_node`
- - Current way of `HeterogeneousEDProducer` with the `ExternalWork` resembles `tbb::async_node`
- - In principle the `streaming_node`-way would, for a chain of
- GPU-enabled modules, allow the GPU to immediately continue to the
- next module without waiting the code path to go through the CMSSW
- framework
-* Add support for more devices
- - E.g. OpenCL, FPGA, remote offload
-* Explore the implementation of these features into the core CMSSW framework
- - E.g. HeterogeneousProduct would likely go to edm::Wrapper
-* Explore how to make core framework/TBB scheduling aware of heterogenous devices
-
-# HeterogeneousEDProducer
-
-`HeterogeneousEDProducer` is implemented as a `stream::EDProducer` using the
-[`ExternalWork` extension](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork).
-
-## Configuration
-
-`HeterogeneousEDProducer` requires `heterogeneousEnabled_` `cms.PSet`
-to be available in the module configuration. The `PSet` can be used to
-override module-by-module which devices can be used by that module.
-The structure of the `PSet` are shown in the following example
-
-```python
-process.foo = cms.EDModule("FooProducer",
- a = cms.int32(1),
- b = cms.VPSet(...),
- ...
- heterogeneousEnabled_ = cms.untracked.PSet(
- GPUCuda = cms.untracked.bool(True),
- FPGA = cms.untracked.bool(False), # forbid the module from running on a device type (note that we don't support FPGA devices at the moment though)
- force = cms.untracked.string("GPUCuda") # force the module to run on a specific device type
- )
-)
-```
-
-The difference between the boolean flags and the `force` parameter is the following
-* The boolean flags control whether the algorithm can be scheduled on the individual device type or not
-* The `force` parameter implies that the algorithm is always scheduled on that device type no matter what. If the device type is not available on the machine, an exception is thrown.
-
-Currently, with only CUDA GPU and CPU support, this level of configurability is a bit overkill though.
-
-## Class declaration
-
-In order to use the `HeterogeneousEDProducer` the `EDProducer` class
-must inherit from `HeterogeneousEDProducer<...>`. The devices, which
-the `EDProducer` is supposed to support, are given as a template
-argument via `heterogeneous::HeterogeneousDevices<...>`. The usual
-[stream producer extensions](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#Template_Arguments)
-can be also passed via additional template arguments.
-
-```cpp
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" // needed for heterogeneous::GPUCuda
-
-class FooProducer: public HeterogeneousEDProducer<
- heterogeneous::HeterogeneousDevices<
- heterogeneous::GPUCuda,
- heterogeneous::CPU
- >
- // here you can pass any stream producer extensions
-> {
- ...
-
-```
-
-In this example the `FooProducer` declares that prodives
-implementations for CUDA GPU and CPU. Note that currently CPU is
-mandatory, and it has to be the last argument. The order of the
-devices dictates the order that the algorithm is scheduled to the
-devices. E.g. in this example, the system runs the algorithm in GPU if
-it can, and only if it can not, in CPU. For the list of supported
-device types, see the [list below](#devices).
-
-## Constructor
-
-`HeterogeneousEDProducer` needs access to the configuration, so it has
-to be passed to its constructor as in the following example
-
-```cpp
-FooProducer::FooProducer(edm::ParameterSet const& iConfig):
- HeterogeneousEDProducer(iConfig),
- ...
-```
-### Consumes
-
-If the `EDProducer` reads any `HeterogeneousProduct`'s
-([see more details](#heterogeneousproduct)), the `consumes()` call
-should be made along the following
-
-```cpp
-class FooProducer ... {
- ...
- EDGetTokenT token_;
-};
-...
-FooProducer::FooProducer(edm::ParameterSet const& iConfig):
- ...
- token_(consumesHeterogeneous(iConfig.getParameter("..."))),
- ...
-```
-
-so that `HeterogeneousEDProducer` can inspect the location of input
-heterogeneous products to decide on which device to run the algorithm
-([see more details](#device-scheduling)).
-
-### Produces
-
-If the `EDProducer` produces any `HeterogeneousProduct`'s
-([see more details](#heterogeneousproduct)), the `produces()` call
-should be made along the following (i.e. as usual)
-
-```cpp
-FooProducer::FooProducer(edm::ParameterSet const& iConfig) ... {
- ...
- produces();
-}
-```
-
-## fillDescriptions()
-
-`HeterogeneousEDProducer` provides a `fillPSetDescription()` function
-that can be called from the concrete `EDProducer`'s
-`fillDescriptions()` as in the following example
-
-```cpp
-void FooProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
- edm::ParameterSetDescription desc;
-
- // fill desc for other parameters
-
- HeterogeneousEDProducer::fillPSetDescription(desc);
-
- descriptions.add("fooProducer", desc);
-}
-```
-
-## Device scheduling
-
-The per-event scheduling of algorithms is currently as follows
-1. If there are no `HeterogeneousProduct` inputs ([see more details](#heterogeneousproduct))
- * Loop over the device types in the order specified in `heterogeneous::HeterogeneousDevices<...>` template arguments
- * Run the algorithm on the first device that is enabled for that module (instance)
-2. If there are `HeterogeneousProduct` inputs
- * Run the algorithm on **the device where the data of the first input resides**
-
-## HeterogeneousProduct
-
-The `HeterogeneousProduct` is a transient edm product with the following properties
-* placeholder for products (of arbitrary types) in all device types
-* tracks the location of the data
-* automatic, on-demand transfers from device to CPU
- - developer has to provide a function to do the transfer and possible data reorganiazation
-
-Some of the complexity exists to avoid ROOT dictionary generation of the concrete product types.
-
-## HeterogeneousEvent
-
-The `HeterogeneousEvent` is a wrapper on top of `edm::Event` to hide
-most of the complexity of `HeterogeneousProduct` to make its use look
-almost like standard products with `edm::Event`. Some part of the
-`edm::Event` interface is implemented (and delegated back to
-`edm::Event`) in order to get/put standard products as well.
-
-Here is a short example how to deal use `HeterogeneousProduct` with
-`HeterogeneousEvent` (using the same `FooProducer` example as before)
-
-```cpp
-class FooProducer ... {
- ...
- // in principle these definitions should be treated like DataFormats
- struct CPUProduct {
- std::vector foo;
- };
- struct GPUProduct {
- float *foo_d; // pointer to GPU memory
- }
-
- using InputType = HeterogeneousProductImpl,
- heterogeneous::GPUProduct>;
- using OutputType = InputType; // using same input and output only because of laziness
-
- void transferGPUtoCPU(GPUProduct const& gpu, CPUProduct& cpu) const;
-};
-
-void FooProducer::produceCPU(edm::HeterogeneousEvent const& iEvent, ...) {
- edm::Handle hinput;
- iEvent.getByToken(token_, hinput); // note the InputType template argument
-
- // do whatever you want with hinput->foo;
-
- auto output = std::make_unique(...);
- iEvent.put(std::move(output)); // note the OutputType template argument
-}
-
-void FooProducer::acquireGPUCuda(edm::HeterogeneousEvent const& iEvent, ...) {
- edm::Handle hinput;
- iEvent.getByToken(token_, hinput); // note the InputType template argument
-
- // do whatever you want with hinput->foo_d;
-
- auto output = std::make_unique(...);
- // For non-CPU products, a GPU->CPU transfer function must be provided
- // In this example it is prodided as a lambda calling a member function, but this is not required
- // The function can be anything assignable to std::function
- iEvent.put(std::move(output), [this](GPUProduct const& gpu, CPUProduct& cpu) { // note the OutputType template argument
- this->transferGPUtoCPU(gpu, cpu);
- });
- // It is also possible to disable the GPU->CPU transfer
- // If the data resides on a GPU, and the corresponding CPU product is requested, an exception is thrown
- //iEvent.put(std::move(output), heterogeneous::DisableTransfer); // note the OutputType template argument
-}
-
-```
-
-
-## Devices
-
-This section documents which functions the `EDProducer` can/has to
-implement for various devices.
-
-### CPU
-
-A CPU implementation is declared by giving `heterogeneous::CPU` as a
-template argument to `heterogeneous::HeterogeneousDevices`. Currently
-it is a mandatory argument, and has to be the last one (i.e. there
-must always be a CPU implementation, which is used as the last resort
-if there are no other devices).
-
-#### Optional functions
-
-There is one optional function
-
-```cpp
-void beginStreamCPU(edm::StreamID id);
-```
-
-which is called at the beginning of an `edm::Stream`. Usually there is
-no need to implement it, but the possibility is provided in case it is
-needed for something (as the `stream::EDProducer::beginStream()` is
-overridden by `HeterogeneousEDProducer`).
-
-#### Mandatory functions
-
-There is one mandatory function
-
-```cpp
-void produceCPU(edm::HeterogeneousEvent& iEvent, edm::EventSetup const& iSetup);
-```
-
-which is almost equal to the usual `stream::EDProducer::produce()`
-function. It is called from `HeterogeneousEDProducer::produce()` if
-the device scheduler decides that the algorithm should be run on CPU
-([see more details](#device-scheduling)). The first argument is
-`edm::HeterogeneousEvent` instead of the usual `edm::Event`
-([see more details](#heterogeneousevent)).
-
-The function should read its input, run the algorithm, and put the
-output to the event.
-
-### CUDA GPU
-
-A CUDA GPU implementation is declared by giving
-`heterogeneous::GPUCuda` as a template argument to
-`heterogeneous::HeterogeneousDevices`. The following `#include` is
-also needed
-```cpp
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-```
-#### Optional functions
-
-There is one optional function
-
-```cpp
-void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream);
-```
-
-which is called at the beginning of an `edm::Stream`. **If the
-algorithm has to allocate memory buffers for the duration of the whole
-job, the recommended place is here.** The current CUDA device is set
-by the framework before the call, and all asynchronous tasks should be
-enqueued to the CUDA stream given as an argument.
-
-Currently the very same CUDA stream object will be given to the
-`acquireGPUCuda()` and `produceGPUCuda()` for this `edm::Stream`. This
-may change in the future though, in which case it will still be
-guaranteed that the CUDA stream here will be synchronized before the
-first call to `acquireGPUCuda()`.
-
-#### Mandatory functions
-
-There are two mandatory functions:
-
-```cpp
-void acquireGPUCuda(edm::HeterogeneousEvent const& iEvent, edm::EventSetup const& iSetup, cuda::stream_t<>& cudaStream);
-```
-
-The `acquireGPUCuda()` is called from
-`HeterogeneousEDProducer::acquire()` if the device scheduler devices
-that the algorithm should be run on a CUDA GPU
-([see more details](#device-scheduling)). The function should read
-the necessary input (which may possibly be already on a GPU,
-([see more details](#heterogeneousproduct)), and enqueue the
-*asynchronous* work on the CUDA stream given as an argument. The
-current CUDA deviceis set by the framework before the call. After the
-`acquireGPUCuda()` returns, framework will itself enqueue a callback
-function to the CUDA stream that will call
-`edm::WaitingTaskWithArenaHolder::doneWaiting()` to signal to the
-framework that this `EDProducer` is ready to transition to
-`produce()`.
-
-Currently the very same CUDA stream will be given to the
-`produceGPUCuda()`.
-
-
-```cpp
-void produceGPUCuda(edm::HeterogeneousEvent& iEvent, edm::EventSetup const& iSetup, cuda::stream_t<>& cudaStream);
-```
-
-The `produceGPUCuda()` is called from
-`HeterogeneousEDProducer::produce()` if the algorithm was run on a
-CUDA GPU. The function should do any necessary GPU->CPU transfers,
-post-processing, and put the products to the event (for passing "GPU
-products" [see here](#heterogeneousproduct)).
-
-#### Memory allocations
-
-The `cudaMalloc()` is somewhat heavy function (synchronizing the whole
-device, among others). The current strategy (although not enforced by
-the framework) is to allocate memory buffers at the beginning of a
-job. It is recommended to do these allocations in the
-`beginStreamGPUCuda()`, as it is called exactly once per job per
-`stream::EDProducer` instance, and it is the earliest point in the
-framework where we have the concept of `edm::Stream` so that the
-framework can assign the `edm::Stream`s to CUDA devices
-([see more details](#multipledevices)).
-
-Freeing the GPU memory can be done in the destructor as it does not
-require any special support from the framework.
-
-#### Multiple devices
-
-Currently `edm::Stream`'s are statically assigned to CUDA devices in a
-round-robin fashion. The assignment is done at the `beginStream()`
-time before calling the `EDProducer` `beginStreamDevice()` functions.
-
-Technically "assigning `edm::Stream`" means that each relevant
-`EDProducer` instance of that `edm::Stream` will hold a device id of
-`streamId % numberOfDevices`.
-
-### Mock GPU
-
-The `GPUMock` is intended only for testing of the framework as a
-something requiring a GPU-like interface but still ran on the CPU. The
-documentation is left to be the code itself.
diff --git a/HeterogeneousCore/Producer/interface/DeviceWrapper.h b/HeterogeneousCore/Producer/interface/DeviceWrapper.h
deleted file mode 100644
index 3bcb4f83c6c73..0000000000000
--- a/HeterogeneousCore/Producer/interface/DeviceWrapper.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef HeterogeneousCore_Producer_DeviceWrapper_h
-#define HeterogeneousCore_Producer_DeviceWrapper_h
-
-namespace heterogeneous {
- template
- struct Mapping;
-}
-
-#define DEFINE_DEVICE_WRAPPER(DEVICE, ENUM) \
- template <> \
- struct Mapping { \
- template \
- static void beginStream(DEVICE& algo, Args&&... args) { \
- algo.call_beginStream##DEVICE(std::forward(args)...); \
- } \
- template \
- static bool acquire(DEVICE& algo, Args&&... args) { \
- return algo.call_acquire##DEVICE(std::forward(args)...); \
- } \
- template \
- static void produce(DEVICE& algo, Args&&... args) { \
- algo.call_produce##DEVICE(std::forward(args)...); \
- } \
- static constexpr HeterogeneousDevice deviceEnum = ENUM; \
- }
-
-#endif
diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h b/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h
deleted file mode 100644
index ea9e6e9f7a210..0000000000000
--- a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef HeterogeneousCore_Producer_HeterogeneousEDProducer_h
-#define HeterogeneousCore_Producer_HeterogeneousEDProducer_h
-
-#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/Exception.h"
-
-#include "DataFormats/Common/interface/Handle.h"
-
-#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-#include "HeterogeneousCore/Producer/interface/DeviceWrapper.h"
-
-namespace heterogeneous {
- class CPU {
- public:
- explicit CPU(const edm::ParameterSet& iConfig) {}
- virtual ~CPU() noexcept(false);
-
- static void fillPSetDescription(edm::ParameterSetDescription desc) {}
-
- void call_beginStreamCPU(edm::StreamID id) { beginStreamCPU(id); }
- bool call_acquireCPU(edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder);
- void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup);
-
- private:
- virtual void beginStreamCPU(edm::StreamID id){};
- virtual void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0;
- };
- DEFINE_DEVICE_WRAPPER(CPU, HeterogeneousDevice::kCPU);
-
- class GPUMock {
- public:
- explicit GPUMock(const edm::ParameterSet& iConfig);
- virtual ~GPUMock() noexcept(false);
-
- static void fillPSetDescription(edm::ParameterSetDescription& desc);
-
- void call_beginStreamGPUMock(edm::StreamID id) { beginStreamGPUMock(id); }
- bool call_acquireGPUMock(DeviceBitSet inputLocation,
- edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder);
- void call_produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
- produceGPUMock(iEvent, iSetup);
- }
-
- private:
- virtual void beginStreamGPUMock(edm::StreamID id){};
- virtual void acquireGPUMock(const edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- std::function callback) = 0;
- virtual void produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0;
-
- const bool enabled_;
- const bool forced_;
- };
- DEFINE_DEVICE_WRAPPER(GPUMock, HeterogeneousDevice::kGPUMock);
-} // namespace heterogeneous
-
-namespace heterogeneous {
- ////////////////////
- template
- struct CallBeginStream;
- template
- struct CallBeginStream {
- template
- static void call(T& ref, Args&&... args) {
- // may not perfect-forward here in order to be able to forward arguments to next CallBeginStream.
- Mapping::beginStream(ref, args...);
- CallBeginStream::call(ref, std::forward(args)...);
- }
- };
- // break recursion and require CPU to be the last
- template
- struct CallBeginStream {
- template
- static void call(T& ref, Args&&... args) {
- Mapping::beginStream(ref, std::forward(args)...);
- }
- };
-
- ////////////////////
- template
- struct CallAcquire;
- template
- struct CallAcquire {
- template
- static void call(T& ref, const HeterogeneousProductBase* input, Args&&... args) {
- bool succeeded = true;
- DeviceBitSet inputLocation;
- if (input) {
- succeeded = input->isProductOn(Mapping::deviceEnum);
- if (succeeded) {
- inputLocation = input->onDevices(Mapping::deviceEnum);
- }
- }
- if (succeeded) {
- // may not perfect-forward here in order to be able to forward arguments to next CallAcquire.
- succeeded = Mapping::acquire(ref, inputLocation, args...);
- }
- if (!succeeded) {
- CallAcquire::call(ref, input, std::forward(args)...);
- }
- }
- };
- // break recursion and require CPU to be the last
- template
- struct CallAcquire {
- template
- static void call(T& ref, const HeterogeneousProductBase* input, Args&&... args) {
- Mapping::acquire(ref, std::forward(args)...);
- }
- };
-
- ////////////////////
- template
- struct CallProduce;
- template
- struct CallProduce {
- template
- static void call(T& ref, edm::HeterogeneousEvent& iEvent, Args&&... args) {
- if (iEvent.location().deviceType() == Mapping::deviceEnum) {
- Mapping::produce(ref, iEvent, std::forward(args)...);
- } else {
- CallProduce::call(ref, iEvent, std::forward(args)...);
- }
- }
- };
- template
- struct CallProduce {
- template
- static void call(T& ref, Args&&... args) {}
- };
-
- template
- class HeterogeneousDevices : public Devices... {
- public:
- explicit HeterogeneousDevices(const edm::ParameterSet& iConfig) : Devices(iConfig)... {}
-
- static void fillPSetDescription(edm::ParameterSetDescription& desc) {
- // The usual trick to expand the parameter pack for function call
- using expander = int[];
- (void)expander{0, ((void)Devices::fillPSetDescription(desc), 1)...};
- desc.addUntracked("force", "");
- }
-
- void call_beginStream(edm::StreamID id) { CallBeginStream::call(*this, id); }
-
- void call_acquire(const HeterogeneousProductBase* input,
- edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- CallAcquire::call(*this, input, iEvent, iSetup, std::move(waitingTaskHolder));
- }
-
- void call_produce(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
- CallProduce::call(*this, iEvent, iSetup);
- }
- };
-} // end namespace heterogeneous
-
-template
-class HeterogeneousEDProducer : public Devices, public edm::stream::EDProducer {
-public:
- explicit HeterogeneousEDProducer(const edm::ParameterSet& iConfig)
- : Devices(iConfig.getUntrackedParameter("heterogeneousEnabled_")) {}
- ~HeterogeneousEDProducer() override = default;
-
-protected:
- edm::EDGetTokenT consumesHeterogeneous(const edm::InputTag& tag) {
- tokens_.push_back(this->template consumes(tag));
- return tokens_.back();
- }
-
- static void fillPSetDescription(edm::ParameterSetDescription& desc) {
- edm::ParameterSetDescription nested;
- Devices::fillPSetDescription(nested);
- desc.addUntracked("heterogeneousEnabled_", nested);
- }
-
-private:
- void beginStream(edm::StreamID id) override { Devices::call_beginStream(id); }
-
- void acquire(const edm::Event& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder) final {
- const HeterogeneousProductBase* input = nullptr;
-
- std::vector products;
- for (const auto& token : tokens_) {
- edm::Handle handle;
- iEvent.getByToken(token, handle);
- if (handle.isValid()) {
- // let the user acquire() code to deal with missing products
- // (and hope they don't mess up the scheduling!)
- products.push_back(handle.product());
- }
- }
- if (!products.empty()) {
- // TODO: check all inputs, not just the first one
- input = products[0]->getBase();
- }
-
- auto eventWrapper = edm::HeterogeneousEvent(&iEvent, &algoExecutionLocation_);
- Devices::call_acquire(input, eventWrapper, iSetup, std::move(waitingTaskHolder));
- }
-
- void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) final {
- if (algoExecutionLocation_.deviceType() == HeterogeneousDeviceId::kInvalidDevice) {
- // TODO: eventually fall back to CPU
- throw cms::Exception("LogicError")
- << "Trying to produce(), but algorithm was not executed successfully anywhere?";
- }
- auto eventWrapper = edm::HeterogeneousEvent(&iEvent, &algoExecutionLocation_);
- Devices::call_produce(eventWrapper, iSetup);
- }
-
- std::vector > tokens_;
- HeterogeneousDeviceId algoExecutionLocation_;
-};
-
-#endif
diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h b/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h
deleted file mode 100644
index 26b48ac308028..0000000000000
--- a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h
+++ /dev/null
@@ -1,145 +0,0 @@
-#ifndef HeterogeneousCore_Producer_HeterogeneousEvent_h
-#define HeterogeneousCore_Producer_HeterogeneousEvent_h
-
-#include "FWCore/Framework/interface/Event.h"
-#include "DataFormats/Common/interface/Handle.h"
-
-#include "HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h"
-#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
-
-namespace edm {
- class HeterogeneousEvent {
- public:
- HeterogeneousEvent(const edm::Event* event, HeterogeneousDeviceId* location)
- : constEvent_(event), location_(location) {}
- HeterogeneousEvent(edm::Event* event, HeterogeneousDeviceId* location)
- : event_(event), constEvent_(event), location_(location) {}
-
- // Accessors to members
- edm::Event& event() { return *event_; }
- const edm::Event& event() const { return *constEvent_; }
-
- // For the "acquire" phase, the "input location" is used for
- // scheduling, while "location" is used to set the location where
- // the algorithm was run
- void setInputLocation(HeterogeneousDeviceId location) { inputLocation_ = location; }
-
- std::function locationSetter() {
- return [loc = location_](HeterogeneousDeviceId location) { *loc = location; };
- }
- const HeterogeneousDeviceId& location() const { return *location_; }
-
- // Delegate to edm::Event
- auto id() const { return constEvent_->id(); }
- auto streamID() const { return constEvent_->streamID(); }
-
- template
- bool getByToken(const Token& token, edm::Handle& handle) const {
- edm::Handle tmp;
- constEvent_->getByToken(token, tmp);
- if (tmp.failedToGet()) {
- auto copy = tmp.whyFailedFactory();
- handle = edm::Handle(std::move(copy));
- return false;
- }
- if (tmp.isValid()) {
-#define CASE(ENUM) \
- case ENUM: \
- this->template get(handle, tmp, 0); \
- break
- switch (inputLocation_.deviceType()) {
- CASE(HeterogeneousDevice::kCPU);
- CASE(HeterogeneousDevice::kGPUMock);
- CASE(HeterogeneousDevice::kGPUCuda);
- default:
- throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device "
- << static_cast(location().deviceType())
- << ". If you are calling getByToken() from produceX() where X != CPU, "
- "please move the call to acquireX().";
- }
-#undef CASE
- return true;
- }
- return false;
- }
-
- // Delegate standard getByToken to edm::Event
- template
- bool getByToken(const Token& token, edm::Handle& handle) const {
- return constEvent_->getByToken(token, handle);
- }
-
- template
- auto put(std::unique_ptr product) {
- return event_->put(std::move(product));
- }
-
- template
- auto put(std::unique_ptr product, std::string const& productInstanceName) {
- return event_->put(std::move(product), productInstanceName);
- }
-
- template
- void put(std::unique_ptr product) {
- assert(location().deviceType() == HeterogeneousDevice::kCPU);
- event_->put(std::make_unique(
- Product(heterogeneous::HeterogeneousDeviceTag(), std::move(*product))));
- }
-
- template
- auto put(std::unique_ptr product, F transferToCPU) {
- std::unique_ptr prod;
-#define CASE(ENUM) \
- case ENUM: \
- this->template make(prod, std::move(product), std::move(transferToCPU), 0); \
- break
- switch (location().deviceType()) {
- CASE(HeterogeneousDevice::kGPUMock);
- CASE(HeterogeneousDevice::kGPUCuda);
- default:
- throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::put(): no case statement for device "
- << static_cast(location().deviceType());
- }
-#undef CASE
- return event_->put(std::move(prod));
- }
-
- private:
- template
- typename std::enable_if_t::value, void> get(
- edm::Handle& dst, const edm::Handle& src, int) const {
- const auto& concrete = src->get();
- const auto& provenance = src.provenance();
- dst = edm::Handle(&(concrete.template getProduct()), provenance);
- }
- template
- void get(edm::Handle& dst, const edm::Handle& src, long) const {
- throw cms::Exception("Assert") << "Invalid call to get, Device " << static_cast(Device) << " Product "
- << typeid(Product).name() << " Type " << typeid(Type).name()
- << " CanGet::FromType "
- << typeid(typename Product::template CanGet::FromType).name()
- << " CanGet::value " << Product::template CanGet::value;
- }
-
- template
- typename std::enable_if_t::value, void> make(
- std::unique_ptr& ret, std::unique_ptr product, F transferToCPU, int) {
- ret = std::make_unique(Product(
- heterogeneous::HeterogeneousDeviceTag(), std::move(*product), location(), std::move(transferToCPU)));
- }
- template
- void make(std::unique_ptr& ret, std::unique_ptr product, F transferToCPU, long) {
- throw cms::Exception("Assert") << "Invalid call to make, Device " << static_cast(Device) << " Product "
- << typeid(Product).name() << " Type " << typeid(Type).name() << " CanPut::ToType "
- << typeid(typename Product::template CanPut::ToType).name()
- << " CanPut::value " << Product::template CanPut::value;
- }
-
- edm::Event* event_ = nullptr;
- const edm::Event* constEvent_ = nullptr;
- HeterogeneousDeviceId inputLocation_;
- HeterogeneousDeviceId* location_ = nullptr;
- };
-} // end namespace edm
-
-#endif
diff --git a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc b/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc
deleted file mode 100644
index 0ebe43d38ee3a..0000000000000
--- a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-
-#include
-#include
-#include
-#include
-
-namespace heterogeneous {
- CPU::~CPU() noexcept(false) {}
-
- bool CPU::call_acquireCPU(edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- // There is no need for acquire in CPU, everything can be done in produceCPU().
- iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU));
- waitingTaskHolder.doneWaiting(nullptr);
- return true;
- }
-
- void CPU::call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
- // For CPU we set the heterogeneous input location for produce, because there is no acquire
- // For other devices this probably doesn't make sense, because the device code is supposed to be launched from acquire.
- iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU, 0));
- produceCPU(iEvent, iSetup);
- }
-
- GPUMock::GPUMock(const edm::ParameterSet& iConfig)
- : enabled_(iConfig.getUntrackedParameter("GPUMock")),
- forced_(iConfig.getUntrackedParameter("force") == "GPUMock") {}
-
- GPUMock::~GPUMock() noexcept(false) {}
-
- void GPUMock::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.addUntracked("GPUMock", true); }
-
- bool GPUMock::call_acquireGPUMock(DeviceBitSet inputLocation,
- edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- if (!enabled_) {
- edm::LogPrint("HeterogeneousEDProducer")
- << "Mock GPU is not available for this module (disabled in configuration)";
- return false;
- }
-
- if (!forced_) {
- // Decide randomly whether to run on GPU or CPU to simulate scheduler decisions
- std::random_device r;
- std::mt19937 gen(r());
- auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability
- if (dist1(gen) == 0) {
- edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)";
- return false;
- }
- }
-
- try {
- iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0));
- acquireGPUMock(iEvent,
- iSetup,
- [waitingTaskHolder, // copy needed for the catch block
- locationSetter = iEvent.locationSetter(),
- location = &(iEvent.location())]() mutable {
- locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0));
- waitingTaskHolder.doneWaiting(nullptr);
- });
- } catch (...) {
- waitingTaskHolder.doneWaiting(std::current_exception());
- }
- return true;
- }
-} // namespace heterogeneous
diff --git a/HeterogeneousCore/Producer/test/BuildFile.xml b/HeterogeneousCore/Producer/test/BuildFile.xml
deleted file mode 100644
index 4d56079ce3f87..0000000000000
--- a/HeterogeneousCore/Producer/test/BuildFile.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc b/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc
deleted file mode 100644
index 2748fcf61d663..0000000000000
--- a/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-#include "TestGPUConcurrency.h"
-#include "TestGPUConcurrencyAlgo.h"
-
-TestGPUConcurrency::TestGPUConcurrency(edm::ParameterSet const& config)
- : HeterogeneousEDProducer(config),
- blocks_(config.getParameter("blocks")),
- threads_(config.getParameter("threads")),
- sleep_(config.getParameter("sleep")) {}
-
-void TestGPUConcurrency::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
- edm::ParameterSetDescription desc;
- HeterogeneousEDProducer::fillPSetDescription(desc);
- desc.add("blocks", 100);
- desc.add("threads", 256);
- desc.add("sleep", 1000000);
- descriptions.add("testGPUConcurrency", desc);
-}
-
-void TestGPUConcurrency::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
- algo_ = new TestGPUConcurrencyAlgo(blocks_, threads_, sleep_);
-}
-
-void TestGPUConcurrency::acquireGPUCuda(const edm::HeterogeneousEvent& event,
- const edm::EventSetup& setup,
- cuda::stream_t<>& cudaStream) {
- algo_->kernelWrapper(cudaStream.id());
-}
-
-void TestGPUConcurrency::produceCPU(edm::HeterogeneousEvent& event, const edm::EventSetup& setup) {}
-
-void TestGPUConcurrency::produceGPUCuda(edm::HeterogeneousEvent& event,
- const edm::EventSetup& setup,
- cuda::stream_t<>& cudaStream) {}
-
-#include "FWCore/Framework/interface/MakerMacros.h"
-DEFINE_FWK_MODULE(TestGPUConcurrency);
diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrency.h b/HeterogeneousCore/Producer/test/TestGPUConcurrency.h
deleted file mode 100644
index e67bc0f4fa6a4..0000000000000
--- a/HeterogeneousCore/Producer/test/TestGPUConcurrency.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef HeterogeneousCore_Producer_test_TestGPUConcurrency_h
-#define HeterogeneousCore_Producer_test_TestGPUConcurrency_h
-
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
-
-class TestGPUConcurrencyAlgo;
-
-/**
- * The purpose of this test is to demonstrate running multiple kernels concurrently on a GPU,
- * associated to different framework streams on he CPU.
- */
-class TestGPUConcurrency
- : public HeterogeneousEDProducer> {
-public:
- explicit TestGPUConcurrency(edm::ParameterSet const& config);
- ~TestGPUConcurrency() override = default;
-
- static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-private:
- using OutputType =
- HeterogeneousProductImpl, heterogeneous::GPUCudaProduct>;
-
- void beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) override;
- void acquireGPUCuda(const edm::HeterogeneousEvent& event,
- const edm::EventSetup& setup,
- cuda::stream_t<>& cudaStream) override;
- void produceGPUCuda(edm::HeterogeneousEvent& event,
- const edm::EventSetup& setup,
- cuda::stream_t<>& cudaStream) override;
- void produceCPU(edm::HeterogeneousEvent& event, const edm::EventSetup& setup) override;
-
- // GPU code
-private:
- TestGPUConcurrencyAlgo* algo_;
-
- // data members
-private:
- unsigned int blocks_;
- unsigned int threads_;
- unsigned int sleep_;
-};
-
-#endif // HeterogeneousCore_Producer_test_TestGPUConcurrency_h
diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu b/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu
deleted file mode 100644
index 7e67e0de4a196..0000000000000
--- a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-#include
-#include
-
-#include "TestGPUConcurrencyAlgo.h"
-
-__global__ void kernel(uint32_t sleep) {
- volatile int sum = 0;
- auto index = threadIdx.x + blockDim.x * blockIdx.x;
- if (index < 32)
- for (uint32_t i = 0; i < sleep; ++i)
- sum += i;
-}
-
-void TestGPUConcurrencyAlgo::kernelWrapper(cudaStream_t stream) const {
- kernel<<>>(sleep_);
-}
diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h b/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h
deleted file mode 100644
index 1750a43ca92bf..0000000000000
--- a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h
-#define HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h
-
-#include
-
-class TestGPUConcurrencyAlgo {
-public:
- TestGPUConcurrencyAlgo(unsigned int blocks, unsigned int threads, unsigned int sleep)
- : blocks_(blocks), threads_(threads), sleep_(sleep) {}
-
- void kernelWrapper(cudaStream_t stream) const;
-
- // data members
-private:
- unsigned int blocks_;
- unsigned int threads_;
- unsigned int sleep_;
-};
-
-#endif // HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc
deleted file mode 100644
index 43d4facfc7122..0000000000000
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "FWCore/Framework/interface/global/EDAnalyzer.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/transform.h"
-
-#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
-
-#include
-
-class TestHeterogeneousEDProducerAnalyzer : public edm::global::EDAnalyzer<> {
-public:
- explicit TestHeterogeneousEDProducerAnalyzer(edm::ParameterSet const& iConfig);
- ~TestHeterogeneousEDProducerAnalyzer() override = default;
-
- static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-private:
- void analyze(edm::StreamID streamID, const edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
-
- using InputType = HeterogeneousProductImpl,
- heterogeneous::GPUCudaProduct>>;
- std::string label_;
- std::vector> srcTokens_;
-};
-
-TestHeterogeneousEDProducerAnalyzer::TestHeterogeneousEDProducerAnalyzer(const edm::ParameterSet& iConfig)
- : label_(iConfig.getParameter("@module_label")),
- srcTokens_(
- edm::vector_transform(iConfig.getParameter>("src"),
- [this](const edm::InputTag& tag) { return consumes(tag); })) {}
-
-void TestHeterogeneousEDProducerAnalyzer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
- edm::ParameterSetDescription desc;
- desc.add>("src", std::vector{});
- descriptions.add("testHeterogeneousEDProducerAnalyzer", desc);
-}
-
-void TestHeterogeneousEDProducerAnalyzer::analyze(edm::StreamID streamID,
- const edm::Event& iEvent,
- const edm::EventSetup& iSetup) const {
- edm::Handle hinput;
- int inp = 0;
- for (const auto& token : srcTokens_) {
- iEvent.getByToken(token, hinput);
- edm::LogPrint("TestHeterogeneousEDProducerAnalyzer")
- << "Analyzer event " << iEvent.id().event() << " stream " << streamID << " label " << label_ << " coll " << inp
- << " result " << hinput->get().getProduct();
- ++inp;
- }
-}
-
-DEFINE_FWK_MODULE(TestHeterogeneousEDProducerAnalyzer);
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc
deleted file mode 100644
index 415bac567f129..0000000000000
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
-
-#include "TestHeterogeneousEDProducerGPUHelpers.h"
-
-#include
-#include
-#include
-
-#include
-#include
-
-/**
- * The purpose of this test is to demonstrate the following
- * - EDProducer implementing an algorithm for CPU and a CUDA GPU
- * - How to initialize the GPU algorithm and make once-per-job-per-stream allocations on the device
- * - How to read heterogeneous product from event
- * - How to write heterogeneous product to event
- * * Especially pointers to device memory
- */
-class TestHeterogeneousEDProducerGPU
- : public HeterogeneousEDProducer> {
-public:
- explicit TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig);
- ~TestHeterogeneousEDProducerGPU() override = default;
-
- static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-private:
- using OutputType =
- HeterogeneousProductImpl,
- heterogeneous::GPUCudaProduct>;
-
- void beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) override;
-
- void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) override;
-
- void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override;
- void produceGPUCuda(edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) override;
-
- std::string label_;
- edm::EDGetTokenT srcToken_;
-
- // GPU stuff
- std::unique_ptr gpuAlgo_;
- TestHeterogeneousEDProducerGPUTask::ResultType gpuOutput_;
-};
-
-TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig)
- : HeterogeneousEDProducer(iConfig), label_(iConfig.getParameter("@module_label")) {
- auto srcTag = iConfig.getParameter("src");
- if (!srcTag.label().empty()) {
- srcToken_ = consumesHeterogeneous(srcTag);
- }
-
- produces();
-}
-
-void TestHeterogeneousEDProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
- edm::ParameterSetDescription desc;
- desc.add("src", edm::InputTag());
- HeterogeneousEDProducer::fillPSetDescription(desc);
- descriptions.add("testHeterogeneousEDProducerGPU", desc);
-}
-
-void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
- edm::Service cs;
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda begin stream " << streamId << " device "
- << cs->getCurrentDevice();
-
- gpuAlgo_ = std::make_unique();
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda end stream " << streamId << " device "
- << cs->getCurrentDevice();
-}
-
-void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) {
- edm::Service cs;
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda begin event " << iEvent.id().event()
- << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice();
-
- gpuOutput_.first.reset();
- gpuOutput_.second.reset();
-
- TestHeterogeneousEDProducerGPUTask::ResultTypeRaw input = std::make_pair(nullptr, nullptr);
- if (!srcToken_.isUninitialized()) {
- edm::Handle hin;
- iEvent.getByToken(srcToken_, hin);
- input = *hin;
- }
-
- gpuOutput_ = gpuAlgo_->runAlgo(label_, 0, input, cudaStream);
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda end event " << iEvent.id().event()
- << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice();
-}
-
-void TestHeterogeneousEDProducerGPU::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << label_ << " TestHeterogeneousEDProducerGPU::produceCPU begin event " << iEvent.id().event() << " stream "
- << iEvent.streamID();
-
- unsigned int input = 0;
- if (!srcToken_.isUninitialized()) {
- edm::Handle hin;
- iEvent.getByToken(srcToken_, hin);
- input = *hin;
- }
-
- std::random_device r;
- std::mt19937 gen(r());
- auto dist = std::uniform_real_distribution<>(1.0, 3.0);
- auto dur = dist(gen);
- edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream "
- << iEvent.streamID() << " will take " << dur << " seconds";
- std::this_thread::sleep_for(std::chrono::seconds(1) * dur);
-
- const unsigned int output = input + iEvent.streamID() * 100 + iEvent.id().event();
-
- iEvent.put(std::make_unique(output));
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream "
- << iEvent.streamID() << " result " << output;
-}
-
-void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEvent,
- const edm::EventSetup& iSetup,
- cuda::stream_t<>& cudaStream) {
- edm::Service cs;
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda begin event " << iEvent.id().event() << " stream "
- << iEvent.streamID() << " device " << cs->getCurrentDevice();
-
- gpuAlgo_->release(label_, cudaStream);
- iEvent.put(
- std::make_unique(gpuOutput_.first.get(),
- gpuOutput_.second.get()),
- [this,
- eventId = iEvent.event().id().event(),
- streamId = iEvent.event().streamID(),
- dev = cs->getCurrentDevice(),
- &cudaStream](const TestHeterogeneousEDProducerGPUTask::ResultTypeRaw& src, unsigned int& dst) {
- // TODO: try to abstract both the current device setting and the delivery of cuda::stream to this function
- // It needs some further thought so I leave it now as it is
- // Maybe "per-thread default stream" would help as they are regular CUDA streams (wrt. to the default stream)?
- // Or not, because the current device has to be set correctly.
- // Maybe we should initiate the transfer in all cases?
- cuda::device::current::scoped_override_t<> setDeviceForThisScope(dev);
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label_ << " Copying from GPU to CPU for event " << eventId << " in stream " << streamId;
- dst = TestHeterogeneousEDProducerGPUTask::getResult(src, cudaStream);
- });
-
- // If, for any reason, you want to disable the automatic GPU->CPU transfer, pass heterogeneous::DisableTransfer{} insteads of the function, i.e.
- //iEvent.put(std::make_unique(gpuOutput_.first.get(), gpuOutput_.second.get()), heterogeneous::DisableTransfer{});
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda end event " << iEvent.id().event() << " stream "
- << iEvent.streamID() << " device " << cs->getCurrentDevice();
-}
-
-DEFINE_FWK_MODULE(TestHeterogeneousEDProducerGPU);
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
deleted file mode 100644
index 38af6518cdef3..0000000000000
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ /dev/null
@@ -1,208 +0,0 @@
-#include
-#include
-#include
-
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
-#include "TestHeterogeneousEDProducerGPUHelpers.h"
-
-//
-// Vector Addition Kernel
-//
-namespace {
- template
- __global__ void vectorAdd(const T *a, const T *b, T *c, int numElements) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
- if (i < numElements) {
- c[i] = a[i] + b[i];
- }
- }
-
- template
- __global__ void vectorProd(const T *a, const T *b, T *c, int numElements) {
- int row = blockIdx.y * blockDim.y + threadIdx.y;
- int col = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (row < numElements && col < numElements) {
- c[row * numElements + col] = a[row] * b[col];
- }
- }
-
- template
- __global__ void matrixMul(const T *a, const T *b, T *c, int numElements) {
- int row = blockIdx.y * blockDim.y + threadIdx.y;
- int col = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (row < numElements && col < numElements) {
- T tmp = 0;
- for (int i = 0; i < numElements; ++i) {
- tmp += a[row * numElements + i] * b[i * numElements + col];
- }
- c[row * numElements + col] = tmp;
- }
- }
-
- template
- __global__ void matrixMulVector(const T *a, const T *b, T *c, int numElements) {
- int row = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (row < numElements) {
- T tmp = 0;
- for (int i = 0; i < numElements; ++i) {
- tmp += a[row * numElements + i] * b[i];
- }
- c[row] = tmp;
- }
- }
-} // namespace
-
-int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
- // Example from Viktor/cuda-api-wrappers
- constexpr int NUM_VALUES = 10000;
-
- auto current_device = cuda::device::current::get();
- auto stream = current_device.create_stream(cuda::stream::no_implicit_synchronization_with_default_stream);
-
- auto h_a = cudautils::make_host_unique(NUM_VALUES, nullptr);
- auto h_b = cudautils::make_host_unique(NUM_VALUES, nullptr);
- auto h_c = cudautils::make_host_unique(NUM_VALUES, nullptr);
-
- for (auto i = 0; i < NUM_VALUES; i++) {
- h_a[i] = input + i;
- h_b[i] = i * i;
- }
-
- auto d_a = cudautils::make_device_unique(NUM_VALUES, nullptr);
- auto d_b = cudautils::make_device_unique(NUM_VALUES, nullptr);
- auto d_c = cudautils::make_device_unique(NUM_VALUES, nullptr);
-
- cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
- cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
-
- int threadsPerBlock{256};
- int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
-
- vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- /*
- // doesn't work with header-only?
- cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock},
- d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
- */
-
- cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
-
- stream.synchronize();
-
- int ret = 0;
- for (auto i = 0; i < 10; i++) {
- ret += h_c[i];
- }
-
- return ret;
-}
-
-namespace {
- constexpr int NUM_VALUES = 10000;
-}
-
-TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() {
- h_a = cudautils::make_host_unique(NUM_VALUES, nullptr);
- h_b = cudautils::make_host_unique(NUM_VALUES, nullptr);
-
- auto current_device = cuda::device::current::get();
- d_b = cudautils::make_device_unique(NUM_VALUES, nullptr);
- d_ma = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr);
- d_mb = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr);
- d_mc = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr);
-}
-
-TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTask::runAlgo(
- const std::string &label, int input, const ResultTypeRaw inputArrays, cuda::stream_t<> &stream) {
- // First make the sanity check
- if (inputArrays.first != nullptr) {
- auto h_check = std::make_unique(NUM_VALUES);
- cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
- for (int i = 0; i < NUM_VALUES; ++i) {
- if (h_check[i] != i) {
- throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
- << h_check[i];
- }
- }
- }
-
- for (auto i = 0; i < NUM_VALUES; i++) {
- h_a[i] = i;
- h_b[i] = i * i;
- }
-
- auto current_device = cuda::device::current::get();
- auto d_a = cudautils::make_device_unique(NUM_VALUES, nullptr);
- auto d_c = cudautils::make_device_unique(NUM_VALUES, nullptr);
- if (inputArrays.second != nullptr) {
- d_d = cudautils::make_device_unique(NUM_VALUES, nullptr);
- }
-
- // Create stream
- cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
- cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
-
- int threadsPerBlock{32};
- int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id();
- vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- if (inputArrays.second != nullptr) {
- vectorAdd<<>>(inputArrays.second, d_c.get(), d_d.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- std::swap(d_c, d_d);
- }
-
- dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES};
- dim3 blocksPerGrid3{1, 1};
- if (NUM_VALUES * NUM_VALUES > 32) {
- threadsPerBlock3.x = 32;
- threadsPerBlock3.y = 32;
- blocksPerGrid3.x = ceil(double(NUM_VALUES) / double(threadsPerBlock3.x));
- blocksPerGrid3.y = ceil(double(NUM_VALUES) / double(threadsPerBlock3.y));
- }
- vectorProd<<>>(d_a.get(), d_b.get(), d_ma.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- vectorProd<<>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- matrixMul<<>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
- matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES);
- cudaCheck(cudaGetLastError());
-
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id()
- << " CUDA stream " << stream.id();
- return std::make_pair(std::move(d_a), std::move(d_c));
-}
-
-void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda::stream_t<> &stream) {
- // any way to automate the release?
- edm::LogPrint("TestHeterogeneousEDProducerGPU")
- << " " << label << " GPU releasing temporary memory device " << cuda::stream::associated_device(stream.id())
- << " CUDA stream " << stream.id();
- d_d.reset();
-}
-
-int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) {
- auto h_c = cudautils::make_device_unique(NUM_VALUES, nullptr);
- cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
- stream.synchronize();
-
- float ret = 0;
- for (auto i = 0; i < NUM_VALUES; i++) {
- ret += h_c[i];
- }
-
- return static_cast(ret);
-}
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h
deleted file mode 100644
index 9429f7a343c9e..0000000000000
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef HeterogeneousCore_Producer_TestHeterogneousEDProducerGPUHelpers
-#define HeterogeneousCore_Producer_TestHeterogneousEDProducerGPUHelpers
-
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-#include
-#include
-#include
-#include
-
-int TestHeterogeneousEDProducerGPUHelpers_simple_kernel(int input);
-
-class TestHeterogeneousEDProducerGPUTask {
-public:
- TestHeterogeneousEDProducerGPUTask();
- ~TestHeterogeneousEDProducerGPUTask() = default;
-
- using Ptr = cudautils::device::unique_ptr;
- using PtrRaw = Ptr::pointer;
-
- using ResultType = std::pair;
- using ResultTypeRaw = std::pair;
- using ConstResultTypeRaw = std::pair;
-
- ResultType runAlgo(const std::string& label, int input, const ResultTypeRaw inputArrays, cuda::stream_t<>& stream);
- void release(const std::string& label, cuda::stream_t<>& stream);
- static int getResult(const ResultTypeRaw& d_ac, cuda::stream_t<>& stream);
-
-private:
- std::unique_ptr> streamPtr;
-
- // stored for the job duration
- cudautils::host::unique_ptr h_a;
- cudautils::host::unique_ptr h_b;
- cudautils::device::unique_ptr d_b;
- cudautils::device::unique_ptr d_ma;
- cudautils::device::unique_ptr d_mb;
- cudautils::device::unique_ptr d_mc;
-
- // temporary storage, need to be somewhere to allow async execution
- cudautils::device::unique_ptr d_d;
-};
-
-#endif
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc
deleted file mode 100644
index bb7c8401ff2f0..0000000000000
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-
-#include "tbb/concurrent_vector.h"
-
-#include
-#include
-#include
-#include
-
-/**
- * The purpose of this test is to demonstrate the following
- * - EDProducer implementing an algorithm for CPU and a (mock GPU) device
- * * The mock device exercises all the structures without a need for actual device
- * - How to read heterogeneous product from event
- * - How to read normal product from event
- * - How to write heterogeneous product to event
- */
-class TestHeterogeneousEDProducerGPUMock
- : public HeterogeneousEDProducer > {
-public:
- explicit TestHeterogeneousEDProducerGPUMock(edm::ParameterSet const& iConfig);
- ~TestHeterogeneousEDProducerGPUMock() override = default;
-
- static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-private:
- using OutputType =
- HeterogeneousProductImpl, heterogeneous::GPUMockProduct >;
-
- std::string label_;
- edm::EDGetTokenT