diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml index 19024b9bc2b1a..75f3d15738429 100644 --- a/CUDADataFormats/BeamSpot/BuildFile.xml +++ b/CUDADataFormats/BeamSpot/BuildFile.xml @@ -1,8 +1,7 @@ + - - diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 9e44bfdf35969..907b7647a3452 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -1,11 +1,12 @@ #ifndef CUDADataFormatsCommonHeterogeneousSoA_H #define CUDADataFormatsCommonHeterogeneousSoA_H -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" // a heterogeneous unique pointer... template diff --git a/HeterogeneousCore/CUDACore/interface/GPUCuda.h b/HeterogeneousCore/CUDACore/interface/GPUCuda.h deleted file mode 100644 index a09fb4bd4f60f..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/GPUCuda.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef HeterogeneousCore_CUDAServices_GPUCuda_h -#define HeterogeneousCore_CUDAServices_GPUCuda_h - -#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "HeterogeneousCore/Producer/interface/DeviceWrapper.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" - -#include - -#include - -namespace heterogeneous { - class GPUCuda { - public: - using CallbackType = std::function; - - explicit GPUCuda(const edm::ParameterSet& iConfig); - virtual ~GPUCuda() noexcept(false); - - void call_beginStreamGPUCuda(edm::StreamID id); - bool call_acquireGPUCuda(DeviceBitSet inputLocation, - edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder); - void call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup); - - static void fillPSetDescription(edm::ParameterSetDescription& desc); - - private: - virtual void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream){}; - virtual void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& cudaStream) = 0; - virtual void produceGPUCuda(edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& cudaStream) = 0; - - std::unique_ptr> cudaStream_; - int deviceId_ = -1; // device assigned to this edm::Stream - bool enabled_; - const bool forced_; - }; - DEFINE_DEVICE_WRAPPER(GPUCuda, HeterogeneousDevice::kGPUCuda); -} // namespace heterogeneous - -#endif diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc deleted file mode 100644 index 706b9e6f9a86c..0000000000000 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ /dev/null @@ -1,111 +0,0 @@ -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" - -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" -#include "chooseCUDADevice.h" -#include - -#include - -namespace heterogeneous { - GPUCuda::GPUCuda(const edm::ParameterSet& iConfig) - : enabled_(iConfig.getUntrackedParameter("GPUCuda")), - forced_(iConfig.getUntrackedParameter("force") == "GPUCuda") { - if (forced_ && !enabled_) { - throw cms::Exception("Configuration") - << "It makes no sense to force the module on GPUCuda, and then disable GPUCuda."; - } - } - - GPUCuda::~GPUCuda() noexcept(false) {} - - void GPUCuda::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.addUntracked("GPUCuda", true); } - - void GPUCuda::call_beginStreamGPUCuda(edm::StreamID id) { - edm::Service cudaService; - enabled_ = (enabled_ && cudaService->enabled()); - if (!enabled_) { - if (forced_) { - throw cms::Exception("LogicError") - << "This module was forced to run on GPUCuda, but the device is not available."; - } - return; - } - - // For startes we "statically" assign the device based on - // edm::Stream number. This is suboptimal if the number of - // edm::Streams is not a multiple of the number of CUDA devices - // (and even then there is no load balancing). - // - // TODO: improve. Possible ideas include - // - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use - // * the first module of a chain dictates the device for the rest of the chain - // - our own CUDA memory allocator - // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load - // * would probably still need some buffer space/device to hold e.g. conditions data - // - for conditions, how to handle multiple lumis per job? - deviceId_ = id % cudaService->numberOfDevices(); - - cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); - - // Create the CUDA stream for this module-edm::Stream pair - auto current_device = cuda::device::current::get(); - cudaStream_ = std::make_unique>( - current_device.create_stream(cuda::stream::no_implicit_synchronization_with_default_stream)); - - beginStreamGPUCuda(id, *cudaStream_); - } - - bool GPUCuda::call_acquireGPUCuda(DeviceBitSet inputLocation, - edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - if (!enabled_) { - return false; - } - - // TODO: currently 'forced_ == true' is already assumed. When the - // scheduling logic evolves, add explicit treatment of forced_. 
- - cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); - - try { - iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, 0)); - acquireGPUCuda(iEvent, iSetup, *cudaStream_); - cudaStream_->enqueue.callback( - [deviceId = deviceId_, - waitingTaskHolder, // copy needed for the catch block - locationSetter = iEvent.locationSetter()](cuda::stream::id_t streamId, cuda::status_t status) mutable { - if (status == cudaSuccess) { - locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId)); - LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " - << streamId; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId - << " error " << error << ": " << message; - } catch (cms::Exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - }); - } catch (...) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - return true; - } - - void GPUCuda::call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - // I guess we have to assume that produce() may be called from a different thread than acquire() was run - // The current CUDA device is a thread-local property, so have to set it here - cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); - - produceGPUCuda(iEvent, iSetup, *cudaStream_); - } -} // namespace heterogeneous diff --git a/HeterogeneousCore/Producer/BuildFile.xml b/HeterogeneousCore/Producer/BuildFile.xml deleted file mode 100644 index b75ff104fc2d9..0000000000000 --- a/HeterogeneousCore/Producer/BuildFile.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - diff --git a/HeterogeneousCore/Producer/README.md b/HeterogeneousCore/Producer/README.md deleted file mode 100644 index 595dec70e1006..0000000000000 --- a/HeterogeneousCore/Producer/README.md +++ /dev/null @@ -1,468 +0,0 @@ -# Prototype for CMSSW interface to heterogenous algorithms - -## Introduction - -This package contains a prtotype for the CMSSW interface to -heterogeneous algorithms. The current implementation is, in a sense, a -mini-framework between the CMSSW core framework and the heterogeneous -algorithms. - -More details can be found from the sub-package specific README files (when they get added). - -## Sub-packages - -* [`CUDACore`](../CUDACore) CUDA-specific core components - - *TODO:* Do we actually need this separate from `CUDAServices`? Which one to keep? -* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA -* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code -* [`Producer`](#heterogeneousedproducer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices -* [`Product`](../Product) Core of the mini-framework for data products - -## Design goals - -1. Same module configuration should work on all machines (whether they have heterogeneous devices or not) -2. Keep track of where data is located -3. Run algorithms on the device where their input is located if possible -4. Transfer temporary/transient data to CPU only if needed -5. 
Abstract away repeated boilerplate code
-
-## Design considerations
-
-All below is assuming we do not touch the core CMSSW framework (that
-is left for a later exercise when we know better what exactly we want
-to do).
-
-1. The device-specific algorithms must be implemented and scheduled to the device(s) within a single EDProducer
-2. Need a special product keeping track of the data location
-3. Information of all the input heterogeneous devices must be propagated to the point deciding the device to run the algorithm
-4. The special product e.g. holds functions to do the transfer that are called if/when needed
-
-## General TODO items
-
-There are also many, more specific TODO items mentioned in comments
-within the code. The items below are more general topics (in no
-particular order).
-
-* Improve algorithm-to-device scheduling
-  - Currently, if an algorithm has a GPU implementation and the system has a
-    GPU, the algorithm is always scheduled to the GPU
-    * This may lead to under-utilization of the CPU if most of the
-      computations are offloaded to the GPU
-  - An essential question for making this scheduling more dynamic is
-    what exactly it means (or what we want it to mean) that a "GPU is
-    too busy", so that it is better to run the algorithm on a CPU
-  - Possible ideas to explore
-    * Check the CUDA device utilization (see also the monitoring point below)
-      - Past/current state does not guarantee much about the near future
-    * Use "tokens" as a resource allocation mechanism
-      - How many tokens per device?
-      - What if there are no free tokens now but one becomes available after 1 ms?
-    * In acquire, if the GPU is "busy", put the EDProducer into a queue of
-      heterogeneous tasks. When the GPU "becomes available", pick an
-      EDProducer from the queue and run it on the GPU. If the CPU runs out of
-      work, pick an EDProducer from the queue and run it on the CPU.
-      - How to define "busy" and "becomes available"?
-      - How to become aware that the CPU has run out of work?
-    * Can we create a TBB task that is executed only if there is nothing else to do?
-  - How does this interact with possible other devices? E.g. if an algorithm has implementations for CPU, GPU, and FPGA?
-* Improve edm::Stream-to-CUDA-device scheduling (see the sketch below)
-  - Currently each edm::Stream is assigned "statically" to a CUDA device in a round-robin fashion
-    * There is no load balancing, so throughput and utilization will not be optimal
-  - The reason for bothering with this is that `cudaMalloc` is a
-    heavy operation (see the next point), not to be called for each event.
-    Instead we preallocate the buffers the algorithms need at
-    initialization time. In the presence of multiple devices this
-    pre-allocation leads to a need to somehow match the edm::Streams
-    and devices.
-  - Possible ideas to explore
-    * Naively we could allocate a buffer per edm::Stream on each CUDA device
-      - The amount of allocated memory is rather excessive
-    * For N edm::Streams and M GPUs, allocate int((N+1)/M) buffers on each device, eventually pick the least utilized GPU
-      - No unnecessary buffers
-      - Need a global list/queue of these buffers per module
-        * Can the list be abstracted? If not, this solution scales poorly with the number of modules
-    * Our own CUDA memory allocator that provides a fast way to allocate scratch buffers
-      - Allows allocating the buffers on-demand on the "best-suited" device
-* Our own CUDA memory allocator
-  - A `cudaMalloc` is a global synchronization point and takes time,
-    so we want to minimize the number of calls to it. This is the main reason to
-    assign edm::Streams to CUDA devices (see the previous point).
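As a standalone illustration of the static edm::Stream-to-device assignment referred to in the items above, a minimal sketch (plain C++, not CMSSW code; the device count, the load metric, and the function names are made-up example values) contrasting the current round-robin choice with a hypothetical "least-loaded" choice:

```cpp
// Standalone sketch, not framework code: the round-robin assignment
// (streamId modulo number of devices) versus picking the least-loaded device.
#include <algorithm>
#include <cstdio>
#include <vector>

int assignRoundRobin(unsigned int streamId, int numberOfDevices) {
  // what the static per-stream assignment currently amounts to
  return static_cast<int>(streamId) % numberOfDevices;
}

int assignLeastLoaded(const std::vector<int>& busyBuffersPerDevice) {
  // hypothetical alternative: pick the device with the fewest buffers in use
  auto it = std::min_element(busyBuffersPerDevice.begin(), busyBuffersPerDevice.end());
  return static_cast<int>(it - busyBuffersPerDevice.begin());
}

int main() {
  const int numberOfDevices = 2;                     // example value
  std::vector<int> busyBuffers(numberOfDevices, 0);  // crude per-device load proxy
  for (unsigned int streamId = 0; streamId < 5; ++streamId) {
    int statically = assignRoundRobin(streamId, numberOfDevices);
    int dynamically = assignLeastLoaded(busyBuffers);
    ++busyBuffers[dynamically];
    std::printf("edm::Stream %u -> device %d (round-robin), %d (least-loaded)\n",
                streamId, statically, dynamically);
  }
  return 0;
}
```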
- - Well-performing allocators are typically highly non-trivial to construct -* Conditions data on GPU - - Currently each module takes care of formatting, transferring, and updating the conditions data to GPU - - This is probably good-enough for the current prototyping phase, but what about longer term? - * How to deal with multiple devices, multiple edm::Streams, and multiple lumi sections in flight? - * Do we need to make EventSetup aware of the devices? How much do the details depend on device type? -* Add possibility to initiate the GPU->CPU transfer before the CPU product is needed - - This would enable overlapping the GPU->CPU transfer while CPU is busy - with other work, so the CPU product requestor would not have to wait -* Improve configurability - - E.g. for preferred device order? -* Add fault tolerance - - E.g. in a case of a GPU running out of memory continue with CPU - - Should be configurable -* Add support for multiple heterogeneous inputs for a module - - Currently the device scheduling is based only on the "first input" - - Clearly this is inadequate in general and needs to be improved - - Any better suggestions than taking `and` of all locations? -* Improve resource monitoring - - E.g. for CUDA device utilization - * https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g540824faa6cef45500e0d1dc2f50b321 - * https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t - - Include in `CUDAService` or add a separete monitoring service? -* Add support for a mode similar to `tbb::streaming_node` - - Current way of `HeterogeneousEDProducer` with the `ExternalWork` resembles `tbb::async_node` - - In principle the `streaming_node`-way would, for a chain of - GPU-enabled modules, allow the GPU to immediately continue to the - next module without waiting the code path to go through the CMSSW - framework -* Add support for more devices - - E.g. OpenCL, FPGA, remote offload -* Explore the implementation of these features into the core CMSSW framework - - E.g. HeterogeneousProduct would likely go to edm::Wrapper -* Explore how to make core framework/TBB scheduling aware of heterogenous devices - -# HeterogeneousEDProducer - -`HeterogeneousEDProducer` is implemented as a `stream::EDProducer` using the -[`ExternalWork` extension](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork). - -## Configuration - -`HeterogeneousEDProducer` requires `heterogeneousEnabled_` `cms.PSet` -to be available in the module configuration. The `PSet` can be used to -override module-by-module which devices can be used by that module. -The structure of the `PSet` are shown in the following example - -```python -process.foo = cms.EDModule("FooProducer", - a = cms.int32(1), - b = cms.VPSet(...), - ... - heterogeneousEnabled_ = cms.untracked.PSet( - GPUCuda = cms.untracked.bool(True), - FPGA = cms.untracked.bool(False), # forbid the module from running on a device type (note that we don't support FPGA devices at the moment though) - force = cms.untracked.string("GPUCuda") # force the module to run on a specific device type - ) -) -``` - -The difference between the boolean flags and the `force` parameter is the following -* The boolean flags control whether the algorithm can be scheduled on the individual device type or not -* The `force` parameter implies that the algorithm is always scheduled on that device type no matter what. 
If the device type is not available on the machine, an exception is thrown. - -Currently, with only CUDA GPU and CPU support, this level of configurability is a bit overkill though. - -## Class declaration - -In order to use the `HeterogeneousEDProducer` the `EDProducer` class -must inherit from `HeterogeneousEDProducer<...>`. The devices, which -the `EDProducer` is supposed to support, are given as a template -argument via `heterogeneous::HeterogeneousDevices<...>`. The usual -[stream producer extensions](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#Template_Arguments) -can be also passed via additional template arguments. - -```cpp -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" // needed for heterogeneous::GPUCuda - -class FooProducer: public HeterogeneousEDProducer< - heterogeneous::HeterogeneousDevices< - heterogeneous::GPUCuda, - heterogeneous::CPU - > - // here you can pass any stream producer extensions -> { - ... - -``` - -In this example the `FooProducer` declares that prodives -implementations for CUDA GPU and CPU. Note that currently CPU is -mandatory, and it has to be the last argument. The order of the -devices dictates the order that the algorithm is scheduled to the -devices. E.g. in this example, the system runs the algorithm in GPU if -it can, and only if it can not, in CPU. For the list of supported -device types, see the [list below](#devices). - -## Constructor - -`HeterogeneousEDProducer` needs access to the configuration, so it has -to be passed to its constructor as in the following example - -```cpp -FooProducer::FooProducer(edm::ParameterSet const& iConfig): - HeterogeneousEDProducer(iConfig), - ... -``` -### Consumes - -If the `EDProducer` reads any `HeterogeneousProduct`'s -([see more details](#heterogeneousproduct)), the `consumes()` call -should be made along the following - -```cpp -class FooProducer ... { - ... - EDGetTokenT token_; -}; -... -FooProducer::FooProducer(edm::ParameterSet const& iConfig): - ... - token_(consumesHeterogeneous(iConfig.getParameter("..."))), - ... -``` - -so that `HeterogeneousEDProducer` can inspect the location of input -heterogeneous products to decide on which device to run the algorithm -([see more details](#device-scheduling)). - -### Produces - -If the `EDProducer` produces any `HeterogeneousProduct`'s -([see more details](#heterogeneousproduct)), the `produces()` call -should be made along the following (i.e. as usual) - -```cpp -FooProducer::FooProducer(edm::ParameterSet const& iConfig) ... { - ... - produces(); -} -``` - -## fillDescriptions() - -`HeterogeneousEDProducer` provides a `fillPSetDescription()` function -that can be called from the concrete `EDProducer`'s -`fillDescriptions()` as in the following example - -```cpp -void FooProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - - // fill desc for other parameters - - HeterogeneousEDProducer::fillPSetDescription(desc); - - descriptions.add("fooProducer", desc); -} -``` - -## Device scheduling - -The per-event scheduling of algorithms is currently as follows -1. If there are no `HeterogeneousProduct` inputs ([see more details](#heterogeneousproduct)) - * Loop over the device types in the order specified in `heterogeneous::HeterogeneousDevices<...>` template arguments - * Run the algorithm on the first device that is enabled for that module (instance) -2. 
If there are `HeterogeneousProduct` inputs - * Run the algorithm on **the device where the data of the first input resides** - -## HeterogeneousProduct - -The `HeterogeneousProduct` is a transient edm product with the following properties -* placeholder for products (of arbitrary types) in all device types -* tracks the location of the data -* automatic, on-demand transfers from device to CPU - - developer has to provide a function to do the transfer and possible data reorganiazation - -Some of the complexity exists to avoid ROOT dictionary generation of the concrete product types. - -## HeterogeneousEvent - -The `HeterogeneousEvent` is a wrapper on top of `edm::Event` to hide -most of the complexity of `HeterogeneousProduct` to make its use look -almost like standard products with `edm::Event`. Some part of the -`edm::Event` interface is implemented (and delegated back to -`edm::Event`) in order to get/put standard products as well. - -Here is a short example how to deal use `HeterogeneousProduct` with -`HeterogeneousEvent` (using the same `FooProducer` example as before) - -```cpp -class FooProducer ... { - ... - // in principle these definitions should be treated like DataFormats - struct CPUProduct { - std::vector foo; - }; - struct GPUProduct { - float *foo_d; // pointer to GPU memory - } - - using InputType = HeterogeneousProductImpl, - heterogeneous::GPUProduct>; - using OutputType = InputType; // using same input and output only because of laziness - - void transferGPUtoCPU(GPUProduct const& gpu, CPUProduct& cpu) const; -}; - -void FooProducer::produceCPU(edm::HeterogeneousEvent const& iEvent, ...) { - edm::Handle hinput; - iEvent.getByToken(token_, hinput); // note the InputType template argument - - // do whatever you want with hinput->foo; - - auto output = std::make_unique(...); - iEvent.put(std::move(output)); // note the OutputType template argument -} - -void FooProducer::acquireGPUCuda(edm::HeterogeneousEvent const& iEvent, ...) { - edm::Handle hinput; - iEvent.getByToken(token_, hinput); // note the InputType template argument - - // do whatever you want with hinput->foo_d; - - auto output = std::make_unique(...); - // For non-CPU products, a GPU->CPU transfer function must be provided - // In this example it is prodided as a lambda calling a member function, but this is not required - // The function can be anything assignable to std::function - iEvent.put(std::move(output), [this](GPUProduct const& gpu, CPUProduct& cpu) { // note the OutputType template argument - this->transferGPUtoCPU(gpu, cpu); - }); - // It is also possible to disable the GPU->CPU transfer - // If the data resides on a GPU, and the corresponding CPU product is requested, an exception is thrown - //iEvent.put(std::move(output), heterogeneous::DisableTransfer); // note the OutputType template argument -} - -``` - - -## Devices - -This section documents which functions the `EDProducer` can/has to -implement for various devices. - -### CPU - -A CPU implementation is declared by giving `heterogeneous::CPU` as a -template argument to `heterogeneous::HeterogeneousDevices`. Currently -it is a mandatory argument, and has to be the last one (i.e. there -must always be a CPU implementation, which is used as the last resort -if there are no other devices). - -#### Optional functions - -There is one optional function - -```cpp -void beginStreamCPU(edm::StreamID id); -``` - -which is called at the beginning of an `edm::Stream`. 
Usually there is
-no need to implement it, but the possibility is provided in case it is
-needed for something (as the `stream::EDProducer::beginStream()` is
-overridden by `HeterogeneousEDProducer`).
-
-#### Mandatory functions
-
-There is one mandatory function
-
-```cpp
-void produceCPU(edm::HeterogeneousEvent& iEvent, edm::EventSetup const& iSetup);
-```
-
-which is almost equal to the usual `stream::EDProducer::produce()`
-function. It is called from `HeterogeneousEDProducer::produce()` if
-the device scheduler decides that the algorithm should be run on CPU
-([see more details](#device-scheduling)). The first argument is
-`edm::HeterogeneousEvent` instead of the usual `edm::Event`
-([see more details](#heterogeneousevent)).
-
-The function should read its input, run the algorithm, and put the
-output into the event.
-
-### CUDA GPU
-
-A CUDA GPU implementation is declared by giving
-`heterogeneous::GPUCuda` as a template argument to
-`heterogeneous::HeterogeneousDevices`. The following `#include` is
-also needed
-```cpp
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-```
-#### Optional functions
-
-There is one optional function
-
-```cpp
-void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream);
-```
-
-which is called at the beginning of an `edm::Stream`. **If the
-algorithm has to allocate memory buffers for the duration of the whole
-job, the recommended place is here.** The current CUDA device is set
-by the framework before the call, and all asynchronous tasks should be
-enqueued to the CUDA stream given as an argument.
-
-Currently the very same CUDA stream object will be given to the
-`acquireGPUCuda()` and `produceGPUCuda()` for this `edm::Stream`. This
-may change in the future though, in which case it will still be
-guaranteed that the CUDA stream here will be synchronized before the
-first call to `acquireGPUCuda()`.
-
-#### Mandatory functions
-
-There are two mandatory functions:
-
-```cpp
-void acquireGPUCuda(edm::HeterogeneousEvent const& iEvent, edm::EventSetup const& iSetup, cuda::stream_t<>& cudaStream);
-```
-
-The `acquireGPUCuda()` is called from
-`HeterogeneousEDProducer::acquire()` if the device scheduler decides
-that the algorithm should be run on a CUDA GPU
-([see more details](#device-scheduling)). The function should read
-the necessary input (which may already be on a GPU,
-[see more details](#heterogeneousproduct)) and enqueue the
-*asynchronous* work on the CUDA stream given as an argument. The
-current CUDA device is set by the framework before the call. After
-`acquireGPUCuda()` returns, the framework will itself enqueue a callback
-function to the CUDA stream that will call
-`edm::WaitingTaskWithArenaHolder::doneWaiting()` to signal to the
-framework that this `EDProducer` is ready to transition to
-`produce()`.
-
-Currently the very same CUDA stream will be given to the
-`produceGPUCuda()`.
-
-```cpp
-void produceGPUCuda(edm::HeterogeneousEvent& iEvent, edm::EventSetup const& iSetup, cuda::stream_t<>& cudaStream);
-```
-
-The `produceGPUCuda()` is called from
-`HeterogeneousEDProducer::produce()` if the algorithm was run on a
-CUDA GPU. The function should do any necessary GPU->CPU transfers and
-post-processing, and put the products into the event (for passing "GPU
-products" [see here](#heterogeneousproduct)).
-
-#### Memory allocations
-
-The `cudaMalloc()` is a somewhat heavy function (synchronizing the whole
-device, among others).
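A minimal standalone sketch of the allocate-once, reuse-per-event pattern this implies (plain CUDA runtime API; the class name and buffer layout are illustrative assumptions, not part of the framework interface):

```cpp
// Standalone sketch, not framework code: device buffers are allocated once
// (e.g. from beginStreamGPUCuda()), reused for every event, and freed only
// in the destructor. Names and sizes are illustrative assumptions.
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

class ScratchBuffers {
public:
  explicit ScratchBuffers(std::size_t nElements) {
    // cudaMalloc is expensive (device-wide synchronization), so do it once here
    if (cudaMalloc(reinterpret_cast<void**>(&input_d_), nElements * sizeof(float)) != cudaSuccess) {
      throw std::runtime_error("cudaMalloc failed for input buffer");
    }
    if (cudaMalloc(reinterpret_cast<void**>(&output_d_), nElements * sizeof(float)) != cudaSuccess) {
      cudaFree(input_d_);  // avoid leaking the first buffer on failure
      throw std::runtime_error("cudaMalloc failed for output buffer");
    }
  }
  ~ScratchBuffers() {
    // freeing needs no special framework support, so the destructor is sufficient
    cudaFree(input_d_);
    cudaFree(output_d_);
  }
  // non-copyable: the raw device pointers are owned exclusively
  ScratchBuffers(const ScratchBuffers&) = delete;
  ScratchBuffers& operator=(const ScratchBuffers&) = delete;

  float* input() const { return input_d_; }
  float* output() const { return output_d_; }

private:
  float* input_d_ = nullptr;
  float* output_d_ = nullptr;
};
```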
The current strategy (although not enforced by -the framework) is to allocate memory buffers at the beginning of a -job. It is recommended to do these allocations in the -`beginStreamGPUCuda()`, as it is called exactly once per job per -`stream::EDProducer` instance, and it is the earliest point in the -framework where we have the concept of `edm::Stream` so that the -framework can assign the `edm::Stream`s to CUDA devices -([see more details](#multipledevices)). - -Freeing the GPU memory can be done in the destructor as it does not -require any special support from the framework. - -#### Multiple devices - -Currently `edm::Stream`'s are statically assigned to CUDA devices in a -round-robin fashion. The assignment is done at the `beginStream()` -time before calling the `EDProducer` `beginStreamDevice()` functions. - -Technically "assigning `edm::Stream`" means that each relevant -`EDProducer` instance of that `edm::Stream` will hold a device id of -`streamId % numberOfDevices`. - -### Mock GPU - -The `GPUMock` is intended only for testing of the framework as a -something requiring a GPU-like interface but still ran on the CPU. The -documentation is left to be the code itself. diff --git a/HeterogeneousCore/Producer/interface/DeviceWrapper.h b/HeterogeneousCore/Producer/interface/DeviceWrapper.h deleted file mode 100644 index 3bcb4f83c6c73..0000000000000 --- a/HeterogeneousCore/Producer/interface/DeviceWrapper.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef HeterogeneousCore_Producer_DeviceWrapper_h -#define HeterogeneousCore_Producer_DeviceWrapper_h - -namespace heterogeneous { - template - struct Mapping; -} - -#define DEFINE_DEVICE_WRAPPER(DEVICE, ENUM) \ - template <> \ - struct Mapping { \ - template \ - static void beginStream(DEVICE& algo, Args&&... args) { \ - algo.call_beginStream##DEVICE(std::forward(args)...); \ - } \ - template \ - static bool acquire(DEVICE& algo, Args&&... args) { \ - return algo.call_acquire##DEVICE(std::forward(args)...); \ - } \ - template \ - static void produce(DEVICE& algo, Args&&... 
args) { \ - algo.call_produce##DEVICE(std::forward(args)...); \ - } \ - static constexpr HeterogeneousDevice deviceEnum = ENUM; \ - } - -#endif diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h b/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h deleted file mode 100644 index ea9e6e9f7a210..0000000000000 --- a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef HeterogeneousCore_Producer_HeterogeneousEDProducer_h -#define HeterogeneousCore_Producer_HeterogeneousEDProducer_h - -#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" -#include "FWCore/Framework/interface/stream/EDProducer.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/Exception.h" - -#include "DataFormats/Common/interface/Handle.h" - -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" -#include "HeterogeneousCore/Producer/interface/DeviceWrapper.h" - -namespace heterogeneous { - class CPU { - public: - explicit CPU(const edm::ParameterSet& iConfig) {} - virtual ~CPU() noexcept(false); - - static void fillPSetDescription(edm::ParameterSetDescription desc) {} - - void call_beginStreamCPU(edm::StreamID id) { beginStreamCPU(id); } - bool call_acquireCPU(edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder); - void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup); - - private: - virtual void beginStreamCPU(edm::StreamID id){}; - virtual void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0; - }; - DEFINE_DEVICE_WRAPPER(CPU, HeterogeneousDevice::kCPU); - - class GPUMock { - public: - explicit GPUMock(const edm::ParameterSet& iConfig); - virtual ~GPUMock() noexcept(false); - - static void fillPSetDescription(edm::ParameterSetDescription& desc); - - void call_beginStreamGPUMock(edm::StreamID id) { beginStreamGPUMock(id); } - bool call_acquireGPUMock(DeviceBitSet inputLocation, - edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder); - void call_produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - produceGPUMock(iEvent, iSetup); - } - - private: - virtual void beginStreamGPUMock(edm::StreamID id){}; - virtual void acquireGPUMock(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - std::function callback) = 0; - virtual void produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0; - - const bool enabled_; - const bool forced_; - }; - DEFINE_DEVICE_WRAPPER(GPUMock, HeterogeneousDevice::kGPUMock); -} // namespace heterogeneous - -namespace heterogeneous { - //////////////////// - template - struct CallBeginStream; - template - struct CallBeginStream { - template - static void call(T& ref, Args&&... args) { - // may not perfect-forward here in order to be able to forward arguments to next CallBeginStream. - Mapping::beginStream(ref, args...); - CallBeginStream::call(ref, std::forward(args)...); - } - }; - // break recursion and require CPU to be the last - template - struct CallBeginStream { - template - static void call(T& ref, Args&&... 
args) { - Mapping::beginStream(ref, std::forward(args)...); - } - }; - - //////////////////// - template - struct CallAcquire; - template - struct CallAcquire { - template - static void call(T& ref, const HeterogeneousProductBase* input, Args&&... args) { - bool succeeded = true; - DeviceBitSet inputLocation; - if (input) { - succeeded = input->isProductOn(Mapping::deviceEnum); - if (succeeded) { - inputLocation = input->onDevices(Mapping::deviceEnum); - } - } - if (succeeded) { - // may not perfect-forward here in order to be able to forward arguments to next CallAcquire. - succeeded = Mapping::acquire(ref, inputLocation, args...); - } - if (!succeeded) { - CallAcquire::call(ref, input, std::forward(args)...); - } - } - }; - // break recursion and require CPU to be the last - template - struct CallAcquire { - template - static void call(T& ref, const HeterogeneousProductBase* input, Args&&... args) { - Mapping::acquire(ref, std::forward(args)...); - } - }; - - //////////////////// - template - struct CallProduce; - template - struct CallProduce { - template - static void call(T& ref, edm::HeterogeneousEvent& iEvent, Args&&... args) { - if (iEvent.location().deviceType() == Mapping::deviceEnum) { - Mapping::produce(ref, iEvent, std::forward(args)...); - } else { - CallProduce::call(ref, iEvent, std::forward(args)...); - } - } - }; - template - struct CallProduce { - template - static void call(T& ref, Args&&... args) {} - }; - - template - class HeterogeneousDevices : public Devices... { - public: - explicit HeterogeneousDevices(const edm::ParameterSet& iConfig) : Devices(iConfig)... {} - - static void fillPSetDescription(edm::ParameterSetDescription& desc) { - // The usual trick to expand the parameter pack for function call - using expander = int[]; - (void)expander{0, ((void)Devices::fillPSetDescription(desc), 1)...}; - desc.addUntracked("force", ""); - } - - void call_beginStream(edm::StreamID id) { CallBeginStream::call(*this, id); } - - void call_acquire(const HeterogeneousProductBase* input, - edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CallAcquire::call(*this, input, iEvent, iSetup, std::move(waitingTaskHolder)); - } - - void call_produce(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - CallProduce::call(*this, iEvent, iSetup); - } - }; -} // end namespace heterogeneous - -template -class HeterogeneousEDProducer : public Devices, public edm::stream::EDProducer { -public: - explicit HeterogeneousEDProducer(const edm::ParameterSet& iConfig) - : Devices(iConfig.getUntrackedParameter("heterogeneousEnabled_")) {} - ~HeterogeneousEDProducer() override = default; - -protected: - edm::EDGetTokenT consumesHeterogeneous(const edm::InputTag& tag) { - tokens_.push_back(this->template consumes(tag)); - return tokens_.back(); - } - - static void fillPSetDescription(edm::ParameterSetDescription& desc) { - edm::ParameterSetDescription nested; - Devices::fillPSetDescription(nested); - desc.addUntracked("heterogeneousEnabled_", nested); - } - -private: - void beginStream(edm::StreamID id) override { Devices::call_beginStream(id); } - - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) final { - const HeterogeneousProductBase* input = nullptr; - - std::vector products; - for (const auto& token : tokens_) { - edm::Handle handle; - iEvent.getByToken(token, handle); - if (handle.isValid()) { - // let the user acquire() code to deal 
with missing products - // (and hope they don't mess up the scheduling!) - products.push_back(handle.product()); - } - } - if (!products.empty()) { - // TODO: check all inputs, not just the first one - input = products[0]->getBase(); - } - - auto eventWrapper = edm::HeterogeneousEvent(&iEvent, &algoExecutionLocation_); - Devices::call_acquire(input, eventWrapper, iSetup, std::move(waitingTaskHolder)); - } - - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) final { - if (algoExecutionLocation_.deviceType() == HeterogeneousDeviceId::kInvalidDevice) { - // TODO: eventually fall back to CPU - throw cms::Exception("LogicError") - << "Trying to produce(), but algorithm was not executed successfully anywhere?"; - } - auto eventWrapper = edm::HeterogeneousEvent(&iEvent, &algoExecutionLocation_); - Devices::call_produce(eventWrapper, iSetup); - } - - std::vector > tokens_; - HeterogeneousDeviceId algoExecutionLocation_; -}; - -#endif diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h b/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h deleted file mode 100644 index 26b48ac308028..0000000000000 --- a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef HeterogeneousCore_Producer_HeterogeneousEvent_h -#define HeterogeneousCore_Producer_HeterogeneousEvent_h - -#include "FWCore/Framework/interface/Event.h" -#include "DataFormats/Common/interface/Handle.h" - -#include "HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -namespace edm { - class HeterogeneousEvent { - public: - HeterogeneousEvent(const edm::Event* event, HeterogeneousDeviceId* location) - : constEvent_(event), location_(location) {} - HeterogeneousEvent(edm::Event* event, HeterogeneousDeviceId* location) - : event_(event), constEvent_(event), location_(location) {} - - // Accessors to members - edm::Event& event() { return *event_; } - const edm::Event& event() const { return *constEvent_; } - - // For the "acquire" phase, the "input location" is used for - // scheduling, while "location" is used to set the location where - // the algorithm was run - void setInputLocation(HeterogeneousDeviceId location) { inputLocation_ = location; } - - std::function locationSetter() { - return [loc = location_](HeterogeneousDeviceId location) { *loc = location; }; - } - const HeterogeneousDeviceId& location() const { return *location_; } - - // Delegate to edm::Event - auto id() const { return constEvent_->id(); } - auto streamID() const { return constEvent_->streamID(); } - - template - bool getByToken(const Token& token, edm::Handle& handle) const { - edm::Handle tmp; - constEvent_->getByToken(token, tmp); - if (tmp.failedToGet()) { - auto copy = tmp.whyFailedFactory(); - handle = edm::Handle(std::move(copy)); - return false; - } - if (tmp.isValid()) { -#define CASE(ENUM) \ - case ENUM: \ - this->template get(handle, tmp, 0); \ - break - switch (inputLocation_.deviceType()) { - CASE(HeterogeneousDevice::kCPU); - CASE(HeterogeneousDevice::kGPUMock); - CASE(HeterogeneousDevice::kGPUCuda); - default: - throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device " - << static_cast(location().deviceType()) - << ". 
If you are calling getByToken() from produceX() where X != CPU, " - "please move the call to acquireX()."; - } -#undef CASE - return true; - } - return false; - } - - // Delegate standard getByToken to edm::Event - template - bool getByToken(const Token& token, edm::Handle& handle) const { - return constEvent_->getByToken(token, handle); - } - - template - auto put(std::unique_ptr product) { - return event_->put(std::move(product)); - } - - template - auto put(std::unique_ptr product, std::string const& productInstanceName) { - return event_->put(std::move(product), productInstanceName); - } - - template - void put(std::unique_ptr product) { - assert(location().deviceType() == HeterogeneousDevice::kCPU); - event_->put(std::make_unique( - Product(heterogeneous::HeterogeneousDeviceTag(), std::move(*product)))); - } - - template - auto put(std::unique_ptr product, F transferToCPU) { - std::unique_ptr prod; -#define CASE(ENUM) \ - case ENUM: \ - this->template make(prod, std::move(product), std::move(transferToCPU), 0); \ - break - switch (location().deviceType()) { - CASE(HeterogeneousDevice::kGPUMock); - CASE(HeterogeneousDevice::kGPUCuda); - default: - throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::put(): no case statement for device " - << static_cast(location().deviceType()); - } -#undef CASE - return event_->put(std::move(prod)); - } - - private: - template - typename std::enable_if_t::value, void> get( - edm::Handle& dst, const edm::Handle& src, int) const { - const auto& concrete = src->get(); - const auto& provenance = src.provenance(); - dst = edm::Handle(&(concrete.template getProduct()), provenance); - } - template - void get(edm::Handle& dst, const edm::Handle& src, long) const { - throw cms::Exception("Assert") << "Invalid call to get, Device " << static_cast(Device) << " Product " - << typeid(Product).name() << " Type " << typeid(Type).name() - << " CanGet::FromType " - << typeid(typename Product::template CanGet::FromType).name() - << " CanGet::value " << Product::template CanGet::value; - } - - template - typename std::enable_if_t::value, void> make( - std::unique_ptr& ret, std::unique_ptr product, F transferToCPU, int) { - ret = std::make_unique(Product( - heterogeneous::HeterogeneousDeviceTag(), std::move(*product), location(), std::move(transferToCPU))); - } - template - void make(std::unique_ptr& ret, std::unique_ptr product, F transferToCPU, long) { - throw cms::Exception("Assert") << "Invalid call to make, Device " << static_cast(Device) << " Product " - << typeid(Product).name() << " Type " << typeid(Type).name() << " CanPut::ToType " - << typeid(typename Product::template CanPut::ToType).name() - << " CanPut::value " << Product::template CanPut::value; - } - - edm::Event* event_ = nullptr; - const edm::Event* constEvent_ = nullptr; - HeterogeneousDeviceId inputLocation_; - HeterogeneousDeviceId* location_ = nullptr; - }; -} // end namespace edm - -#endif diff --git a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc b/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc deleted file mode 100644 index 0ebe43d38ee3a..0000000000000 --- a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc +++ /dev/null @@ -1,73 +0,0 @@ -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" - -#include "FWCore/MessageLogger/interface/MessageLogger.h" - -#include -#include -#include -#include - -namespace heterogeneous { - CPU::~CPU() noexcept(false) {} - - bool CPU::call_acquireCPU(edm::HeterogeneousEvent& iEvent, - const 
edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // There is no need for acquire in CPU, everything can be done in produceCPU(). - iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU)); - waitingTaskHolder.doneWaiting(nullptr); - return true; - } - - void CPU::call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - // For CPU we set the heterogeneous input location for produce, because there is no acquire - // For other devices this probably doesn't make sense, because the device code is supposed to be launched from acquire. - iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU, 0)); - produceCPU(iEvent, iSetup); - } - - GPUMock::GPUMock(const edm::ParameterSet& iConfig) - : enabled_(iConfig.getUntrackedParameter("GPUMock")), - forced_(iConfig.getUntrackedParameter("force") == "GPUMock") {} - - GPUMock::~GPUMock() noexcept(false) {} - - void GPUMock::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.addUntracked("GPUMock", true); } - - bool GPUMock::call_acquireGPUMock(DeviceBitSet inputLocation, - edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - if (!enabled_) { - edm::LogPrint("HeterogeneousEDProducer") - << "Mock GPU is not available for this module (disabled in configuration)"; - return false; - } - - if (!forced_) { - // Decide randomly whether to run on GPU or CPU to simulate scheduler decisions - std::random_device r; - std::mt19937 gen(r()); - auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability - if (dist1(gen) == 0) { - edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)"; - return false; - } - } - - try { - iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0)); - acquireGPUMock(iEvent, - iSetup, - [waitingTaskHolder, // copy needed for the catch block - locationSetter = iEvent.locationSetter(), - location = &(iEvent.location())]() mutable { - locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0)); - waitingTaskHolder.doneWaiting(nullptr); - }); - } catch (...) 
{ - waitingTaskHolder.doneWaiting(std::current_exception()); - } - return true; - } -} // namespace heterogeneous diff --git a/HeterogeneousCore/Producer/test/BuildFile.xml b/HeterogeneousCore/Producer/test/BuildFile.xml deleted file mode 100644 index 4d56079ce3f87..0000000000000 --- a/HeterogeneousCore/Producer/test/BuildFile.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc b/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc deleted file mode 100644 index 2748fcf61d663..0000000000000 --- a/HeterogeneousCore/Producer/test/TestGPUConcurrency.cc +++ /dev/null @@ -1,41 +0,0 @@ -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "TestGPUConcurrency.h" -#include "TestGPUConcurrencyAlgo.h" - -TestGPUConcurrency::TestGPUConcurrency(edm::ParameterSet const& config) - : HeterogeneousEDProducer(config), - blocks_(config.getParameter("blocks")), - threads_(config.getParameter("threads")), - sleep_(config.getParameter("sleep")) {} - -void TestGPUConcurrency::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - HeterogeneousEDProducer::fillPSetDescription(desc); - desc.add("blocks", 100); - desc.add("threads", 256); - desc.add("sleep", 1000000); - descriptions.add("testGPUConcurrency", desc); -} - -void TestGPUConcurrency::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - algo_ = new TestGPUConcurrencyAlgo(blocks_, threads_, sleep_); -} - -void TestGPUConcurrency::acquireGPUCuda(const edm::HeterogeneousEvent& event, - const edm::EventSetup& setup, - cuda::stream_t<>& cudaStream) { - algo_->kernelWrapper(cudaStream.id()); -} - -void TestGPUConcurrency::produceCPU(edm::HeterogeneousEvent& event, const edm::EventSetup& setup) {} - -void TestGPUConcurrency::produceGPUCuda(edm::HeterogeneousEvent& event, - const edm::EventSetup& setup, - cuda::stream_t<>& cudaStream) {} - -#include "FWCore/Framework/interface/MakerMacros.h" -DEFINE_FWK_MODULE(TestGPUConcurrency); diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrency.h b/HeterogeneousCore/Producer/test/TestGPUConcurrency.h deleted file mode 100644 index e67bc0f4fa6a4..0000000000000 --- a/HeterogeneousCore/Producer/test/TestGPUConcurrency.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef HeterogeneousCore_Producer_test_TestGPUConcurrency_h -#define HeterogeneousCore_Producer_test_TestGPUConcurrency_h - -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -//#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -class TestGPUConcurrencyAlgo; - -/** - * The purpose of this test is to demonstrate running multiple kernels concurrently on a GPU, - * associated to different framework streams on he CPU. 
- */ -class TestGPUConcurrency - : public HeterogeneousEDProducer> { -public: - explicit TestGPUConcurrency(edm::ParameterSet const& config); - ~TestGPUConcurrency() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - using OutputType = - HeterogeneousProductImpl, heterogeneous::GPUCudaProduct>; - - void beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) override; - void acquireGPUCuda(const edm::HeterogeneousEvent& event, - const edm::EventSetup& setup, - cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& event, - const edm::EventSetup& setup, - cuda::stream_t<>& cudaStream) override; - void produceCPU(edm::HeterogeneousEvent& event, const edm::EventSetup& setup) override; - - // GPU code -private: - TestGPUConcurrencyAlgo* algo_; - - // data members -private: - unsigned int blocks_; - unsigned int threads_; - unsigned int sleep_; -}; - -#endif // HeterogeneousCore_Producer_test_TestGPUConcurrency_h diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu b/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu deleted file mode 100644 index 7e67e0de4a196..0000000000000 --- a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.cu +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include - -#include "TestGPUConcurrencyAlgo.h" - -__global__ void kernel(uint32_t sleep) { - volatile int sum = 0; - auto index = threadIdx.x + blockDim.x * blockIdx.x; - if (index < 32) - for (uint32_t i = 0; i < sleep; ++i) - sum += i; -} - -void TestGPUConcurrencyAlgo::kernelWrapper(cudaStream_t stream) const { - kernel<<>>(sleep_); -} diff --git a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h b/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h deleted file mode 100644 index 1750a43ca92bf..0000000000000 --- a/HeterogeneousCore/Producer/test/TestGPUConcurrencyAlgo.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h -#define HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h - -#include - -class TestGPUConcurrencyAlgo { -public: - TestGPUConcurrencyAlgo(unsigned int blocks, unsigned int threads, unsigned int sleep) - : blocks_(blocks), threads_(threads), sleep_(sleep) {} - - void kernelWrapper(cudaStream_t stream) const; - - // data members -private: - unsigned int blocks_; - unsigned int threads_; - unsigned int sleep_; -}; - -#endif // HeterogeneousCore_Producer_test_TestGPUConcurrencyAlgo_h diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc deleted file mode 100644 index 43d4facfc7122..0000000000000 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerAnalyzer.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "FWCore/Framework/interface/global/EDAnalyzer.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Utilities/interface/transform.h" - -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -#include - -class TestHeterogeneousEDProducerAnalyzer : public edm::global::EDAnalyzer<> { -public: - explicit TestHeterogeneousEDProducerAnalyzer(edm::ParameterSet const& iConfig); - ~TestHeterogeneousEDProducerAnalyzer() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - void analyze(edm::StreamID 
streamID, const edm::Event& iEvent, const edm::EventSetup& iSetup) const override; - - using InputType = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct>>; - std::string label_; - std::vector> srcTokens_; -}; - -TestHeterogeneousEDProducerAnalyzer::TestHeterogeneousEDProducerAnalyzer(const edm::ParameterSet& iConfig) - : label_(iConfig.getParameter("@module_label")), - srcTokens_( - edm::vector_transform(iConfig.getParameter>("src"), - [this](const edm::InputTag& tag) { return consumes(tag); })) {} - -void TestHeterogeneousEDProducerAnalyzer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add>("src", std::vector{}); - descriptions.add("testHeterogeneousEDProducerAnalyzer", desc); -} - -void TestHeterogeneousEDProducerAnalyzer::analyze(edm::StreamID streamID, - const edm::Event& iEvent, - const edm::EventSetup& iSetup) const { - edm::Handle hinput; - int inp = 0; - for (const auto& token : srcTokens_) { - iEvent.getByToken(token, hinput); - edm::LogPrint("TestHeterogeneousEDProducerAnalyzer") - << "Analyzer event " << iEvent.id().event() << " stream " << streamID << " label " << label_ << " coll " << inp - << " result " << hinput->get().getProduct(); - ++inp; - } -} - -DEFINE_FWK_MODULE(TestHeterogeneousEDProducerAnalyzer); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc deleted file mode 100644 index 415bac567f129..0000000000000 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc +++ /dev/null @@ -1,183 +0,0 @@ -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" - -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -#include "FWCore/ParameterSet/interface/ParameterSet.h" - -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -#include "TestHeterogeneousEDProducerGPUHelpers.h" - -#include -#include -#include - -#include -#include - -/** - * The purpose of this test is to demonstrate the following - * - EDProducer implementing an algorithm for CPU and a CUDA GPU - * - How to initialize the GPU algorithm and make once-per-job-per-stream allocations on the device - * - How to read heterogeneous product from event - * - How to write heterogeneous product to event - * * Especially pointers to device memory - */ -class TestHeterogeneousEDProducerGPU - : public HeterogeneousEDProducer> { -public: - explicit TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig); - ~TestHeterogeneousEDProducerGPU() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - using OutputType = - HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct>; - - void beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) override; - - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& cudaStream) override; - - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& cudaStream) override; - - std::string label_; - edm::EDGetTokenT srcToken_; - - // GPU stuff - std::unique_ptr gpuAlgo_; 
- TestHeterogeneousEDProducerGPUTask::ResultType gpuOutput_; -}; - -TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig) - : HeterogeneousEDProducer(iConfig), label_(iConfig.getParameter("@module_label")) { - auto srcTag = iConfig.getParameter("src"); - if (!srcTag.label().empty()) { - srcToken_ = consumesHeterogeneous(srcTag); - } - - produces(); -} - -void TestHeterogeneousEDProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag()); - HeterogeneousEDProducer::fillPSetDescription(desc); - descriptions.add("testHeterogeneousEDProducerGPU", desc); -} - -void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - edm::Service cs; - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda begin stream " << streamId << " device " - << cs->getCurrentDevice(); - - gpuAlgo_ = std::make_unique(); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda end stream " << streamId << " device " - << cs->getCurrentDevice(); -} - -void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& cudaStream) { - edm::Service cs; - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda begin event " << iEvent.id().event() - << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice(); - - gpuOutput_.first.reset(); - gpuOutput_.second.reset(); - - TestHeterogeneousEDProducerGPUTask::ResultTypeRaw input = std::make_pair(nullptr, nullptr); - if (!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - - gpuOutput_ = gpuAlgo_->runAlgo(label_, 0, input, cudaStream); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda end event " << iEvent.id().event() - << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice(); -} - -void TestHeterogeneousEDProducerGPU::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << label_ << " TestHeterogeneousEDProducerGPU::produceCPU begin event " << iEvent.id().event() << " stream " - << iEvent.streamID(); - - unsigned int input = 0; - if (!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - - std::random_device r; - std::mt19937 gen(r()); - auto dist = std::uniform_real_distribution<>(1.0, 3.0); - auto dur = dist(gen); - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " - << iEvent.streamID() << " will take " << dur << " seconds"; - std::this_thread::sleep_for(std::chrono::seconds(1) * dur); - - const unsigned int output = input + iEvent.streamID() * 100 + iEvent.id().event(); - - iEvent.put(std::make_unique(output)); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream " - << iEvent.streamID() << " result " << output; -} - -void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - cuda::stream_t<>& 
cudaStream) { - edm::Service cs; - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda begin event " << iEvent.id().event() << " stream " - << iEvent.streamID() << " device " << cs->getCurrentDevice(); - - gpuAlgo_->release(label_, cudaStream); - iEvent.put( - std::make_unique(gpuOutput_.first.get(), - gpuOutput_.second.get()), - [this, - eventId = iEvent.event().id().event(), - streamId = iEvent.event().streamID(), - dev = cs->getCurrentDevice(), - &cudaStream](const TestHeterogeneousEDProducerGPUTask::ResultTypeRaw& src, unsigned int& dst) { - // TODO: try to abstract both the current device setting and the delivery of cuda::stream to this function - // It needs some further thought so I leave it now as it is - // Maybe "per-thread default stream" would help as they are regular CUDA streams (wrt. to the default stream)? - // Or not, because the current device has to be set correctly. - // Maybe we should initiate the transfer in all cases? - cuda::device::current::scoped_override_t<> setDeviceForThisScope(dev); - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label_ << " Copying from GPU to CPU for event " << eventId << " in stream " << streamId; - dst = TestHeterogeneousEDProducerGPUTask::getResult(src, cudaStream); - }); - - // If, for any reason, you want to disable the automatic GPU->CPU transfer, pass heterogeneous::DisableTransfer{} insteads of the function, i.e. - //iEvent.put(std::make_unique(gpuOutput_.first.get(), gpuOutput_.second.get()), heterogeneous::DisableTransfer{}); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda end event " << iEvent.id().event() << " stream " - << iEvent.streamID() << " device " << cs->getCurrentDevice(); -} - -DEFINE_FWK_MODULE(TestHeterogeneousEDProducerGPU); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu deleted file mode 100644 index 38af6518cdef3..0000000000000 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ /dev/null @@ -1,208 +0,0 @@ -#include -#include -#include - -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" -#include "TestHeterogeneousEDProducerGPUHelpers.h" - -// -// Vector Addition Kernel -// -namespace { - template - __global__ void vectorAdd(const T *a, const T *b, T *c, int numElements) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < numElements) { - c[i] = a[i] + b[i]; - } - } - - template - __global__ void vectorProd(const T *a, const T *b, T *c, int numElements) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < numElements && col < numElements) { - c[row * numElements + col] = a[row] * b[col]; - } - } - - template - __global__ void matrixMul(const T *a, const T *b, T *c, int numElements) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < numElements && col < numElements) { - T tmp = 0; - for (int i = 0; i < numElements; ++i) { - tmp += a[row * numElements + i] * b[i * numElements + col]; - } - c[row * numElements + col] = 
tmp; - } - } - - template - __global__ void matrixMulVector(const T *a, const T *b, T *c, int numElements) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < numElements) { - T tmp = 0; - for (int i = 0; i < numElements; ++i) { - tmp += a[row * numElements + i] * b[i]; - } - c[row] = tmp; - } - } -} // namespace - -int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { - // Example from Viktor/cuda-api-wrappers - constexpr int NUM_VALUES = 10000; - - auto current_device = cuda::device::current::get(); - auto stream = current_device.create_stream(cuda::stream::no_implicit_synchronization_with_default_stream); - - auto h_a = cudautils::make_host_unique(NUM_VALUES, nullptr); - auto h_b = cudautils::make_host_unique(NUM_VALUES, nullptr); - auto h_c = cudautils::make_host_unique(NUM_VALUES, nullptr); - - for (auto i = 0; i < NUM_VALUES; i++) { - h_a[i] = input + i; - h_b[i] = i * i; - } - - auto d_a = cudautils::make_device_unique(NUM_VALUES, nullptr); - auto d_b = cudautils::make_device_unique(NUM_VALUES, nullptr); - auto d_c = cudautils::make_device_unique(NUM_VALUES, nullptr); - - cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); - cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); - - int threadsPerBlock{256}; - int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; - - vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - /* - // doesn't work with header-only? - cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock}, - d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); - */ - - cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); - - stream.synchronize(); - - int ret = 0; - for (auto i = 0; i < 10; i++) { - ret += h_c[i]; - } - - return ret; -} - -namespace { - constexpr int NUM_VALUES = 10000; -} - -TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() { - h_a = cudautils::make_host_unique(NUM_VALUES, nullptr); - h_b = cudautils::make_host_unique(NUM_VALUES, nullptr); - - auto current_device = cuda::device::current::get(); - d_b = cudautils::make_device_unique(NUM_VALUES, nullptr); - d_ma = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr); - d_mb = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr); - d_mc = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, nullptr); -} - -TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTask::runAlgo( - const std::string &label, int input, const ResultTypeRaw inputArrays, cuda::stream_t<> &stream) { - // First make the sanity check - if (inputArrays.first != nullptr) { - auto h_check = std::make_unique(NUM_VALUES); - cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost)); - for (int i = 0; i < NUM_VALUES; ++i) { - if (h_check[i] != i) { - throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " - << h_check[i]; - } - } - } - - for (auto i = 0; i < NUM_VALUES; i++) { - h_a[i] = i; - h_b[i] = i * i; - } - - auto current_device = cuda::device::current::get(); - auto d_a = cudautils::make_device_unique(NUM_VALUES, nullptr); - auto d_c = cudautils::make_device_unique(NUM_VALUES, nullptr); - if (inputArrays.second != nullptr) { - d_d = cudautils::make_device_unique(NUM_VALUES, nullptr); - } - - // Create 
stream - cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); - cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); - - int threadsPerBlock{32}; - int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); - vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - if (inputArrays.second != nullptr) { - vectorAdd<<>>(inputArrays.second, d_c.get(), d_d.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - std::swap(d_c, d_d); - } - - dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; - dim3 blocksPerGrid3{1, 1}; - if (NUM_VALUES * NUM_VALUES > 32) { - threadsPerBlock3.x = 32; - threadsPerBlock3.y = 32; - blocksPerGrid3.x = ceil(double(NUM_VALUES) / double(threadsPerBlock3.x)); - blocksPerGrid3.y = ceil(double(NUM_VALUES) / double(threadsPerBlock3.y)); - } - vectorProd<<>>(d_a.get(), d_b.get(), d_ma.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - vectorProd<<>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - matrixMul<<>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); - cudaCheck(cudaGetLastError()); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id() - << " CUDA stream " << stream.id(); - return std::make_pair(std::move(d_a), std::move(d_c)); -} - -void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda::stream_t<> &stream) { - // any way to automate the release? 
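// A self-contained sketch of the asynchronous pattern used throughout this
// helper: pinned host buffers, device buffers, host-to-device copies queued on
// a CUDA stream, a kernel launch on the same stream, a device-to-host copy,
// and a final synchronization. It uses the plain CUDA runtime API rather than
// the cudautils helpers and cuda-api-wrappers used above; the function and
// kernel names are illustrative only.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void exampleVectorAdd(const float* a, const float* b, float* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    c[i] = a[i] + b[i];
}

int exampleAsyncVectorAdd() {
  constexpr int n = 10000;
  constexpr size_t bytes = n * sizeof(float);

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  // Pinned host memory is needed for cudaMemcpyAsync to be truly asynchronous.
  float *h_a, *h_b, *h_c;
  cudaMallocHost((void**)&h_a, bytes);
  cudaMallocHost((void**)&h_b, bytes);
  cudaMallocHost((void**)&h_c, bytes);
  for (int i = 0; i < n; ++i) {
    h_a[i] = i;
    h_b[i] = i * i;
  }

  float *d_a, *d_b, *d_c;
  cudaMalloc((void**)&d_a, bytes);
  cudaMalloc((void**)&d_b, bytes);
  cudaMalloc((void**)&d_c, bytes);

  // Everything below is queued on the same stream, so it executes in order on
  // the device but asynchronously with respect to the host thread.
  cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);

  int threadsPerBlock = 256;
  int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;  // round up
  exampleVectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_a, d_b, d_c, n);

  cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // block the host until all queued work is done

  std::printf("c[2] = %f\n", h_c[2]);  // 2 + 2*2 = 6

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  cudaFreeHost(h_a);
  cudaFreeHost(h_b);
  cudaFreeHost(h_c);
  cudaStreamDestroy(stream);
  return 0;
}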
- edm::LogPrint("TestHeterogeneousEDProducerGPU") - << " " << label << " GPU releasing temporary memory device " << cuda::stream::associated_device(stream.id()) - << " CUDA stream " << stream.id(); - d_d.reset(); -} - -int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) { - auto h_c = cudautils::make_device_unique(NUM_VALUES, nullptr); - cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); - stream.synchronize(); - - float ret = 0; - for (auto i = 0; i < NUM_VALUES; i++) { - ret += h_c[i]; - } - - return static_cast(ret); -} diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h deleted file mode 100644 index 9429f7a343c9e..0000000000000 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef HeterogeneousCore_Producer_TestHeterogneousEDProducerGPUHelpers -#define HeterogeneousCore_Producer_TestHeterogneousEDProducerGPUHelpers - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -#include -#include -#include -#include - -int TestHeterogeneousEDProducerGPUHelpers_simple_kernel(int input); - -class TestHeterogeneousEDProducerGPUTask { -public: - TestHeterogeneousEDProducerGPUTask(); - ~TestHeterogeneousEDProducerGPUTask() = default; - - using Ptr = cudautils::device::unique_ptr; - using PtrRaw = Ptr::pointer; - - using ResultType = std::pair; - using ResultTypeRaw = std::pair; - using ConstResultTypeRaw = std::pair; - - ResultType runAlgo(const std::string& label, int input, const ResultTypeRaw inputArrays, cuda::stream_t<>& stream); - void release(const std::string& label, cuda::stream_t<>& stream); - static int getResult(const ResultTypeRaw& d_ac, cuda::stream_t<>& stream); - -private: - std::unique_ptr> streamPtr; - - // stored for the job duration - cudautils::host::unique_ptr h_a; - cudautils::host::unique_ptr h_b; - cudautils::device::unique_ptr d_b; - cudautils::device::unique_ptr d_ma; - cudautils::device::unique_ptr d_mb; - cudautils::device::unique_ptr d_mc; - - // temporary storage, need to be somewhere to allow async execution - cudautils::device::unique_ptr d_d; -}; - -#endif diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc deleted file mode 100644 index bb7c8401ff2f0..0000000000000 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc +++ /dev/null @@ -1,179 +0,0 @@ -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" - -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -#include "FWCore/ParameterSet/interface/ParameterSet.h" - -#include "tbb/concurrent_vector.h" - -#include -#include -#include -#include - -/** - * The purpose of this test is to demonstrate the following - * - EDProducer implementing an algorithm for CPU and a (mock GPU) device - * * The mock device exercises all the structures without a need for actual device - * - How to read heterogeneous product from event - * - How to read normal product from event - * - How to write heterogeneous product to event - */ -class TestHeterogeneousEDProducerGPUMock - : public HeterogeneousEDProducer > { -public: - explicit 
TestHeterogeneousEDProducerGPUMock(edm::ParameterSet const& iConfig); - ~TestHeterogeneousEDProducerGPUMock() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - using OutputType = - HeterogeneousProductImpl, heterogeneous::GPUMockProduct >; - - std::string label_; - edm::EDGetTokenT srcToken_; - edm::EDGetTokenT srcIntToken_; - - // hack for GPU mock - tbb::concurrent_vector > pendingFutures; - - // simulating GPU memory - unsigned int gpuOutput_; - - void acquireGPUMock(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - std::function callback) override; - - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - void produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; -}; - -TestHeterogeneousEDProducerGPUMock::TestHeterogeneousEDProducerGPUMock(edm::ParameterSet const& iConfig) - : HeterogeneousEDProducer(iConfig), label_(iConfig.getParameter("@module_label")) { - auto srcTag = iConfig.getParameter("src"); - if (!srcTag.label().empty()) { - srcToken_ = consumesHeterogeneous(srcTag); - } - auto srcIntTag = iConfig.getParameter("srcInt"); - if (!srcIntTag.label().empty()) { - srcIntToken_ = consumes(srcIntTag); - } - - produces(); - produces(); - produces("foo"); -} - -void TestHeterogeneousEDProducerGPUMock::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag()); - desc.add("srcInt", edm::InputTag()); - HeterogeneousEDProducer::fillPSetDescription(desc); - descriptions.add("testHeterogeneousEDProducerGPUMock", desc); -} - -void TestHeterogeneousEDProducerGPUMock::acquireGPUMock(const edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup, - std::function callback) { - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " " << label_ << " TestHeterogeneousEDProducerGPUMock::acquireGPUMock begin event " << iEvent.id().event() - << " stream " << iEvent.streamID(); - - unsigned int input = 0; - if (!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - if (!srcIntToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcIntToken_, hin); - input += *hin; - } - - /// GPU work - std::random_device r; - std::mt19937 gen(r()); - auto dist = std::uniform_real_distribution<>(0.1, 1.0); - auto dur = dist(gen); - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " " << label_ << " Task (GPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() - << " will take " << dur << " seconds"; - - auto ret = std::async( - std::launch::async, - [this, dur, input, callback = std::move(callback), eventId = iEvent.id().event(), streamId = iEvent.streamID()]() { - std::this_thread::sleep_for(std::chrono::seconds(1) * dur); - gpuOutput_ = input + streamId * 100 + eventId; - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " " << label_ << " TestHeterogeneousEDProducerGPUMock::acquireGPUMock finished async for event " - << eventId << " stream " << streamId; - callback(); - }); - pendingFutures.push_back(std::move(ret)); - - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " " << label_ << " TestHeterogeneousEDProducerGPUMock::acquireGPUMock end event " << iEvent.id().event() - << " stream " << iEvent.streamID(); -} - -void TestHeterogeneousEDProducerGPUMock::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - 
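// A standalone sketch of the callback pattern acquireGPUMock() uses below: the
// "device" work runs on a separate thread via std::async, the returned future
// is stored so that acquire() can return immediately (a std::async future
// blocks in its destructor), and the caller-provided callback is invoked once
// the result is ready, which in the real module signals the framework that
// produce() may be scheduled. This is a simplified illustration without the
// EDM types; the real code stores the futures in a tbb::concurrent_vector.

#include <chrono>
#include <functional>
#include <future>
#include <thread>
#include <vector>

class MockAsyncWorker {
public:
  // 'callback' plays the role of the edm::WaitingTaskWithArenaHolder that the
  // framework hands to the acquire() step of the producer.
  void acquire(unsigned int input, std::function<void()> callback) {
    pendingFutures_.push_back(std::async(std::launch::async, [this, input, callback = std::move(callback)]() {
      std::this_thread::sleep_for(std::chrono::milliseconds(100));  // pretend to be a GPU
      result_ = input + 42;                                         // "device" result
      callback();  // notify the caller that the asynchronous work has finished
    }));
  }

  // Only meaningful after the callback has fired.
  unsigned int produce() const { return result_; }

private:
  std::vector<std::future<void>> pendingFutures_;  // keeps the asynchronous tasks alive
  unsigned int result_ = 0;
};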
edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << label_ << " TestHeterogeneousEDProducerGPUMock::produceCPU begin event " << iEvent.id().event() << " stream " - << iEvent.streamID(); - - unsigned int input = 0; - if (!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - if (!srcIntToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcIntToken_, hin); - input += *hin; - } - - std::random_device r; - std::mt19937 gen(r()); - auto dist = std::uniform_real_distribution<>(1.0, 3.0); - auto dur = dist(gen); - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur - << " seconds"; - std::this_thread::sleep_for(std::chrono::seconds(1) * dur); - - const unsigned int output = input + iEvent.streamID() * 100 + iEvent.id().event(); - - iEvent.put(std::make_unique(output)); - iEvent.put(std::make_unique(1)); - iEvent.put(std::make_unique(2), "foo"); - - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << label_ << " TestHeterogeneousEDProducerGPUMock::produceCPU end event " << iEvent.id().event() << " stream " - << iEvent.streamID() << " result " << output; -} - -void TestHeterogeneousEDProducerGPUMock::produceGPUMock(edm::HeterogeneousEvent& iEvent, - const edm::EventSetup& iSetup) { - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << label_ << " TestHeterogeneousEDProducerGPUMock::produceGPUMock begin event " << iEvent.id().event() - << " stream " << iEvent.streamID(); - - iEvent.put( - std::make_unique(gpuOutput_), - [this, eventId = iEvent.id().event(), streamId = iEvent.streamID()](const unsigned int& src, unsigned int& dst) { - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << " " << label_ << " Task (GPU) for event " << eventId << " in stream " << streamId << " copying to CPU"; - dst = src; - }); - iEvent.put(std::make_unique(2)); - - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") - << label_ << " TestHeterogeneousEDProducerGPUMock::produceGPUMock end event " << iEvent.id().event() << " stream " - << iEvent.streamID() << " result " << gpuOutput_; -} - -DEFINE_FWK_MODULE(TestHeterogeneousEDProducerGPUMock); diff --git a/HeterogeneousCore/Producer/test/testGPUConcurrency.py b/HeterogeneousCore/Producer/test/testGPUConcurrency.py deleted file mode 100644 index af2f83374a41a..0000000000000 --- a/HeterogeneousCore/Producer/test/testGPUConcurrency.py +++ /dev/null @@ -1,29 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -process = cms.Process('RECO') - -process.load('FWCore.MessageService.MessageLogger_cfi') -process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi') - -# Empty source -process.source = cms.Source("EmptySource") - -process.maxEvents = cms.untracked.PSet( - input = cms.untracked.int32(10) -) - -process.options = cms.untracked.PSet( - numberOfThreads = cms.untracked.uint32( 4 ), - numberOfStreams = cms.untracked.uint32( 4 ), -) - -# Path and EndPath definitions -from HeterogeneousCore.Producer.testGPUConcurrency_cfi import testGPUConcurrency -process.testGPU = testGPUConcurrency.clone() -process.testGPU.sleep = 1000000 -process.testGPU.blocks = 100000 -process.testGPU.threads = 256 - -process.path = cms.Path(process.testGPU) - -process.schedule = cms.Schedule(process.path) diff --git a/HeterogeneousCore/Producer/test/testGPUMock_cfg.py b/HeterogeneousCore/Producer/test/testGPUMock_cfg.py deleted file mode 100644 index 8bfcf031a53c4..0000000000000 --- 
a/HeterogeneousCore/Producer/test/testGPUMock_cfg.py +++ /dev/null @@ -1,36 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -process = cms.Process("Test") -process.load("FWCore.MessageService.MessageLogger_cfi") - -process.source = cms.Source("EmptySource") - -process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) - -process.options = cms.untracked.PSet( -# numberOfThreads = cms.untracked.uint32(4), - numberOfStreams = cms.untracked.uint32(0) -) - - -#process.Tracer = cms.Service("Tracer") -process.prod1 = cms.EDProducer('TestHeterogeneousEDProducerGPUMock') -process.prod2 = cms.EDProducer('TestHeterogeneousEDProducerGPUMock', - src = cms.InputTag("prod1") -) -process.prod3 = cms.EDProducer('TestHeterogeneousEDProducerGPUMock', - srcInt = cms.InputTag("prod1") -) - -#process.t = cms.Task(process.prod1, process.prod2) - -process.eca = cms.EDAnalyzer("EventContentAnalyzer", - getData = cms.untracked.bool(True), - getDataForModuleLabels = cms.untracked.vstring("producer"), - listContent = cms.untracked.bool(True), -) -process.p = cms.Path(process.prod1+process.prod2+process.prod3)#+process.eca) -#process.p.associate(process.t) - -# Example of forcing module to run a specific device for one module via configuration -#process.prod1.heterogeneousEnabled_ = cms.untracked.PSet(force = cms.untracked.string("GPUMock")) diff --git a/HeterogeneousCore/Producer/test/testGPU_cfg.py b/HeterogeneousCore/Producer/test/testGPU_cfg.py deleted file mode 100644 index f9c90ca41b6e5..0000000000000 --- a/HeterogeneousCore/Producer/test/testGPU_cfg.py +++ /dev/null @@ -1,32 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -process = cms.Process("Test") -process.load("FWCore.MessageService.MessageLogger_cfi") - -process.source = cms.Source("EmptySource") - -process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) - -process.options = cms.untracked.PSet( -# numberOfThreads = cms.untracked.uint32(4), - numberOfStreams = cms.untracked.uint32(0) -) - -from HeterogeneousCore.Producer.testHeterogeneousEDProducerGPU_cfi import testHeterogeneousEDProducerGPU as prod - -#process.Tracer = cms.Service("Tracer") -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.prod1 = prod.clone() -process.prod2 = prod.clone(src = "prod1") -process.prod3 = prod.clone(src = "prod1") -process.prod4 = prod.clone() -process.ana = cms.EDAnalyzer("TestHeterogeneousEDProducerAnalyzer", - src = cms.VInputTag("prod2", "prod3", "prod4") -) - -process.t = cms.Task(process.prod1, process.prod2, process.prod3, process.prod4) -process.p = cms.Path(process.ana) -process.p.associate(process.t) - -# Example of disabling CUDA device type for one module via configuration -#process.prod4.heterogeneousEnabled_.GPUCuda = False diff --git a/HeterogeneousCore/Product/BuildFile.xml b/HeterogeneousCore/Product/BuildFile.xml deleted file mode 100644 index 0e6323ac9009b..0000000000000 --- a/HeterogeneousCore/Product/BuildFile.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h b/HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h deleted file mode 100644 index febf02ecf073d..0000000000000 --- a/HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef HeterogeneousCore_Product_HeterogeneousDeviceId_h -#define HeterogeneousCore_Product_HeterogeneousDeviceId_h - -/** - * Enumerator for heterogeneous device types - */ -enum class HeterogeneousDevice { kCPU = 0, kGPUMock, kGPUCuda, kSize }; - 
-namespace heterogeneous { - template - struct HeterogeneousDeviceTag { - constexpr static HeterogeneousDevice value = Device; - }; -} // namespace heterogeneous - -/** - * Class to represent an identifier for a heterogeneous device. - * Contains device type and an integer identifier. - */ -class HeterogeneousDeviceId { -public: - constexpr static auto kInvalidDevice = HeterogeneousDevice::kSize; - - HeterogeneousDeviceId() : deviceType_(kInvalidDevice), deviceId_(0) {} - explicit HeterogeneousDeviceId(HeterogeneousDevice device, unsigned int id = 0) - : deviceType_(device), deviceId_(id) {} - - HeterogeneousDevice deviceType() const { return deviceType_; } - - unsigned int deviceId() const { return deviceId_; } - -private: - HeterogeneousDevice deviceType_; - unsigned int deviceId_; -}; - -#endif diff --git a/HeterogeneousCore/Product/interface/HeterogeneousProduct.h b/HeterogeneousCore/Product/interface/HeterogeneousProduct.h deleted file mode 100644 index 6607cb89feb2a..0000000000000 --- a/HeterogeneousCore/Product/interface/HeterogeneousProduct.h +++ /dev/null @@ -1,377 +0,0 @@ -#ifndef HeterogeneousCore_Product_HeterogeneousProduct_h -#define HeterogeneousCore_Product_HeterogeneousProduct_h - -#include "FWCore/Utilities/interface/Exception.h" - -#include "HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProductBase.h" - -#include -#include -#include -#include - -namespace heterogeneous { - template - std::string bitsetArrayToString(const T& bitsetArray) { - std::string ret; - for (const auto& bitset : bitsetArray) { - ret += bitset.to_string() + " "; - } - return ret; - } - - /** - * The *Product templates are to specify in a generic way which - * data locations and device-specific types the - * HeterogeneousProduct<> supports. - * - * Helper functions are provided to infer the type from input - * arguments to ease the construction of HeterogeneousProduct<> - * - * TODO: try to simplify... - */ - - // Mapping from *Product to HeterogeneousDevice enumerator - template - struct ProductToEnum {}; - -#define DEFINE_DEVICE_PRODUCT(ENUM) \ - template \ - class ENUM##Product { \ - public: \ - using DataType = T; \ - static constexpr const HeterogeneousDevice tag = HeterogeneousDevice::k##ENUM; \ - ENUM##Product() = default; \ - ENUM##Product(T&& data) : data_(std::move(data)) {} \ - const T& product() const { return data_; } \ - T& product() { return data_; } \ - \ - private: \ - T data_; \ - }; \ - template \ - struct ProductToEnum> { \ - static constexpr const HeterogeneousDevice value = HeterogeneousDevice::k##ENUM; \ - } - - DEFINE_DEVICE_PRODUCT(CPU); - DEFINE_DEVICE_PRODUCT(GPUMock); - DEFINE_DEVICE_PRODUCT(GPUCuda); -#undef DEFINE_DEVICE_PRODUCT - - // Tag class to allow disabling automatic device->CPU transfers - struct DisableTransfer {}; - - /** - * Below are various helpers - * - * TODO: move to an inner namespace (e.g. detail, impl)?, possibly to a separate file - */ - - // Empty struct for tuple defitionons - struct Empty {}; - - // Metaprogram to return the *Product type for a given enumerator if it exists in Types... 
pack - template - struct IfInPack; - - template - struct IfInPack { - using type = std::conditional_t::type>; - }; - template - struct IfInPack { - using type = Empty; - }; - - template - using IfInPack_t = typename IfInPack::type; - - // Metaprogram to construct the callback function type for device->CPU transfers - template - struct CallBackType { - using type = std::function; - }; - template - struct CallBackType { - using type = Empty; - }; - template - using CallBackType_t = typename CallBackType::type; - - // Metaprogram to get an element from a tuple, or Empty if out of bounds - template - struct TupleElement { - using type = Empty; - }; - template - struct TupleElement::value)>::type> { - using type = std::tuple_element_t; - }; - template - using TupleElement_t = typename TupleElement::type; - - // Metaprogram to loop over two tuples and an array of bitsets (of - // equal length), and if any element of bitset is set to true call a - // function from one of the tuples with arguments from the second - // tuple - template - struct CallFunctionIf { - static bool call(const FunctionTuple& functionTuple, ProductTuple& productTuple, const BitSetArray& bitsetArray) { - constexpr const auto index = std::tuple_size::value - sizeMinusIndex; - if (bitsetArray[index].any()) { - const auto& func = std::get(functionTuple); - if (!func) { - throw cms::Exception("Assert") << "Attempted to call transfer-to-CPU function for device " << index - << " but the std::function object is not valid!"; - } - func(std::get(productTuple).product(), std::get<0>(productTuple).product()); - return true; - } - return CallFunctionIf, - sizeMinusIndex - 1>::call(functionTuple, productTuple, bitsetArray); - } - }; - template - struct CallFunctionIf { - static bool call(const FunctionTuple& functionTuple, ProductTuple& productTuple, const BitSetArray& bitsetArray) { - constexpr const auto index = std::tuple_size::value - sizeMinusIndex; - return CallFunctionIf, - sizeMinusIndex - 1>::call(functionTuple, productTuple, bitsetArray); - } - }; - template - struct CallFunctionIf { - static bool call(const FunctionTuple& functionTuple, ProductTuple& productTuple, const BitSetArray& bitsetArray) { - return false; - } - }; - - // Metaprogram to specialize getProduct() for CPU - template - struct GetOrTransferProduct { - template - static const auto& getProduct(const FunctionTuple& functionTuple, - ProductTuple& productTuple, - const BitSetArray& location) { - constexpr const auto index = static_cast(device); - if (!location[index].any()) { - throw cms::Exception("LogicError") - << "Called getProduct() for device " << index << " but the data is not there! Location bitfield is " - << bitsetArrayToString(location); - } - return std::get(productTuple).product(); - } - }; - - template <> - struct GetOrTransferProduct { - template - static const auto& getProduct(const FunctionTuple& functionTuple, - ProductTuple& productTuple, - BitSetArray& location) { - constexpr const auto index = static_cast(HeterogeneousDevice::kCPU); - if (!location[index].any()) { - auto found = - CallFunctionIf, - std::tuple_size::value - 1>::call(functionTuple, productTuple, location); - if (!found) { - throw cms::Exception("LogicError") - << "Attempted to transfer data to CPU, but the data is not available anywhere! 
Location bitfield is " - << bitsetArrayToString(location); - } - } - location[index].set(0); - return std::get(productTuple).product(); - } - }; - - // Metaprogram to return DataType or Empty - template - struct DataTypeOrEmpty { - using type = typename T::DataType; - }; - template <> - struct DataTypeOrEmpty { - using type = Empty; - }; -} // namespace heterogeneous - -/** - * Generic data product for holding data on CPU or a heterogeneous - * device which keeps track where the data is. Data can be - * automatically transferred from the device to CPU when data is - * requested on CPU but does not exist there (yet). - * - * TODO: - * * extend transfers to device->device (within a single device type) - */ -template -class HeterogeneousProductImpl : public HeterogeneousProductBase { - using ProductTuple = std::tuple, - heterogeneous::IfInPack_t>; - using TransferToCPUTuple = - std::tuple(HeterogeneousDevice::kGPUMock), ProductTuple>>, - heterogeneous::CallBackType_t< - CPUProduct, - std::tuple_element_t(HeterogeneousDevice::kGPUCuda), ProductTuple>>>; - // Some sanity checks - static_assert(std::tuple_size::value == std::tuple_size::value, "Size mismatch"); - static_assert(std::tuple_size::value == static_cast(HeterogeneousDevice::kSize), - "Size mismatch"); - -public: - template - struct CanGet { - using FromType = typename heterogeneous::DataTypeOrEmpty< - std::tuple_element_t(Device), ProductTuple>>::type; - static const bool value = std::is_same::value; - }; - - template - struct CanPut { - using ToType = typename heterogeneous::DataTypeOrEmpty< - std::tuple_element_t(Device), ProductTuple>>::type; - static const bool value = std::is_same::value; - }; - - HeterogeneousProductImpl() = default; - ~HeterogeneousProductImpl() override = default; - HeterogeneousProductImpl(HeterogeneousProductImpl&& other) { - std::lock(mutex_, other.mutex_); - std::lock_guard lk1(mutex_, std::adopt_lock); - std::lock_guard lk2(other.mutex_, std::adopt_lock); - - products_ = std::move(other.products_); - transfersToCPU_ = std::move(other.transfersToCPU_); - location_ = std::move(other.location_); - } - HeterogeneousProductImpl& operator=(HeterogeneousProductImpl&& other) { - std::lock(mutex_, other.mutex_); - std::lock_guard lk1(mutex_, std::adopt_lock); - std::lock_guard lk2(other.mutex_, std::adopt_lock); - - products_ = std::move(other.products_); - transfersToCPU_ = std::move(other.transfersToCPU_); - location_ = std::move(other.location_); - return *this; - } - - // Constructor for CPU data - template - HeterogeneousProductImpl(heterogeneous::HeterogeneousDeviceTag, D&& data) { - static_assert(Device == HeterogeneousDevice::kCPU, "This overload allows only CPU device"); - constexpr const auto index = static_cast(HeterogeneousDevice::kCPU); - std::get(products_) = std::move(data); - location_[index].set(0); - } - - /** - * Generic constructor for device data. A function to transfer the - * data to CPU has to be provided as well. - */ - template - HeterogeneousProductImpl(heterogeneous::HeterogeneousDeviceTag, - D&& data, - HeterogeneousDeviceId location, - F transferToCPU) { - constexpr const auto index = static_cast(Device); - assert(location.deviceType() == Device); - std::get(products_) = std::move(data); - std::get(transfersToCPU_) = std::move(transferToCPU); - location_[index].set(location.deviceId()); - } - - /** - * Generic constructor for device data, but without the transfer function(!). 
- */ - template - HeterogeneousProductImpl(heterogeneous::HeterogeneousDeviceTag, - D&& data, - HeterogeneousDeviceId location, - heterogeneous::DisableTransfer) { - // TODO: try to avoid code duplication between the other device data - constexpr const auto index = static_cast(Device); - assert(location.deviceType() == Device); - std::get(products_) = std::move(data); - location_[index].set(location.deviceId()); - } - - template - const auto& getProduct() const { - constexpr const auto index = static_cast(device); - static_assert(!std::is_same, heterogeneous::Empty>::value, - "This HeterogeneousProduct does not support this type"); - - // Locking the mutex here is quite "conservative" - // Writes happen only if the "device" is CPU and the data is elsewhere - std::lock_guard lk(mutex_); - return heterogeneous::GetOrTransferProduct::getProduct(transfersToCPU_, products_, location_); - } - -private: - mutable ProductTuple products_; - TransferToCPUTuple transfersToCPU_; -}; - -/** - * The main purpose of the HeterogeneousProduct, - * HeterogeneousProductBase, HeterogeneousProductImpl<...> class - * hierarchy is to avoid the dictionary generation for the concrete - * HeterogeneousProductImpl<...>'s. - */ -class HeterogeneousProduct { -public: - HeterogeneousProduct() = default; - - template - HeterogeneousProduct(HeterogeneousProductImpl&& impl) { - impl_.reset(static_cast(new HeterogeneousProductImpl(std::move(impl)))); - } - - HeterogeneousProduct(HeterogeneousProduct&&) = default; - HeterogeneousProduct& operator=(HeterogeneousProduct&&) = default; - - ~HeterogeneousProduct() = default; - - bool isNonnull() const { return static_cast(impl_); } - bool isNull() const { return !isNonnull(); } - - const HeterogeneousProductBase* getBase() const { return impl_.get(); } - - template - const T& get() const { - if (isNull()) - throw cms::Exception("LogicError") << "HerogeneousProduct is null"; - - const auto& ref = *impl_; - if (typeid(T) != typeid(ref)) { - throw cms::Exception("LogicError") << "Trying to get HeterogeneousProductImpl " << typeid(T).name() - << " but the product contains " << typeid(ref).name(); - } - return static_cast(*impl_); - } - -private: - std::unique_ptr impl_; -}; - -#endif diff --git a/HeterogeneousCore/Product/interface/HeterogeneousProductBase.h b/HeterogeneousCore/Product/interface/HeterogeneousProductBase.h deleted file mode 100644 index b3a952749054e..0000000000000 --- a/HeterogeneousCore/Product/interface/HeterogeneousProductBase.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef HeterogeneousCore_Product_HeterogeneousProductBase_h -#define HeterogeneousCore_Product_HeterogeneousProductBase_h - -#include "HeterogeneousCore/Product/interface/HeterogeneousDeviceId.h" - -#include -#include -#include - -namespace heterogeneous { - constexpr const unsigned int kMaxDevices = 16; - using DeviceBitSet = std::bitset; -} // namespace heterogeneous - -// For type erasure to ease dictionary generation -class HeterogeneousProductBase { -public: - // TODO: Given we'll likely have the data on one or at most a couple - // of devices, storing the information in a "dense" bit pattern may - // be overkill. Maybe a "sparse" presentation would be sufficient - // and easier to deal with? - using BitSet = heterogeneous::DeviceBitSet; - using BitSetArray = std::array(HeterogeneousDevice::kSize)>; - - virtual ~HeterogeneousProductBase() = 0; - - bool isProductOn(HeterogeneousDevice loc) const { - // should this be protected with the mutex? 
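// A much-simplified, standalone sketch of the idea implemented by
// HeterogeneousProductImpl/HeterogeneousProductBase above: keep one slot per
// device, remember where valid copies live in a bitset, and when the CPU copy
// is requested but absent, run the stored device-to-CPU transfer function and
// mark the CPU slot as filled. Names here are illustrative; the real class is
// variadic over the supported device types and guards the transfer with a mutex.

#include <bitset>
#include <functional>
#include <stdexcept>

template <typename T>
class TwoDeviceProduct {
public:
  enum Device { kCPU = 0, kGPU = 1 };

  // Construct with data that currently lives on the GPU plus a transfer function.
  TwoDeviceProduct(T gpuData, std::function<void(const T& src, T& dst)> transferToCPU)
      : gpuData_(std::move(gpuData)), transferToCPU_(std::move(transferToCPU)) {
    location_.set(kGPU);
  }

  bool isProductOn(Device d) const { return location_.test(d); }

  // Requesting the CPU copy triggers the device-to-CPU transfer on demand.
  const T& getProductOnCPU() {
    if (!location_.test(kCPU)) {
      if (!transferToCPU_)
        throw std::logic_error("data is only on the GPU and no transfer function was given");
      transferToCPU_(gpuData_, cpuData_);
      location_.set(kCPU);
    }
    return cpuData_;
  }

private:
  T cpuData_{};
  T gpuData_;
  std::function<void(const T&, T&)> transferToCPU_;
  std::bitset<2> location_;
};

// Usage sketch: the transfer is a plain copy here; in the real code it would
// be a cudaMemcpyAsync followed by a stream synchronization.
//   TwoDeviceProduct<int> p(5, [](const int& src, int& dst) { dst = src; });
//   int onCpu = p.getProductOnCPU();  // triggers the transfer, returns 5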
- return location_[static_cast(loc)].any(); - } - BitSet onDevices(HeterogeneousDevice loc) const { - // should this be protected with the mutex? - return location_[static_cast(loc)]; - } - -protected: - mutable std::mutex mutex_; - mutable BitSetArray location_; -}; - -#endif diff --git a/HeterogeneousCore/Product/src/HeterogeneousProduct.cc b/HeterogeneousCore/Product/src/HeterogeneousProduct.cc deleted file mode 100644 index 38154e7431a97..0000000000000 --- a/HeterogeneousCore/Product/src/HeterogeneousProduct.cc +++ /dev/null @@ -1,3 +0,0 @@ -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -HeterogeneousProductBase::~HeterogeneousProductBase() {} diff --git a/HeterogeneousCore/Product/src/classes.h b/HeterogeneousCore/Product/src/classes.h deleted file mode 100644 index 42d64ad68252b..0000000000000 --- a/HeterogeneousCore/Product/src/classes.h +++ /dev/null @@ -1,9 +0,0 @@ -#include "DataFormats/Common/interface/Wrapper.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include - -namespace { - struct dictionary { - HeterogeneousProduct hp; - }; -} // namespace diff --git a/HeterogeneousCore/Product/src/classes_def.xml b/HeterogeneousCore/Product/src/classes_def.xml deleted file mode 100644 index 3e4263b7905f6..0000000000000 --- a/HeterogeneousCore/Product/src/classes_def.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/HeterogeneousCore/Product/test/BuildFile.xml b/HeterogeneousCore/Product/test/BuildFile.xml deleted file mode 100644 index dbcab64bc0023..0000000000000 --- a/HeterogeneousCore/Product/test/BuildFile.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/HeterogeneousCore/Product/test/testHeterogeneousProduct.cpp b/HeterogeneousCore/Product/test/testHeterogeneousProduct.cpp deleted file mode 100644 index f9b51f9d90b65..0000000000000 --- a/HeterogeneousCore/Product/test/testHeterogeneousProduct.cpp +++ /dev/null @@ -1,303 +0,0 @@ -#include - -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -class testHeterogeneousProduct : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(testHeterogeneousProduct); - CPPUNIT_TEST(testDefault); - CPPUNIT_TEST(testCPU); - CPPUNIT_TEST(testGPUMock); - CPPUNIT_TEST(testGPUCuda); - CPPUNIT_TEST(testGPUAll); - CPPUNIT_TEST(testMoveGPUMock); - CPPUNIT_TEST(testMoveGPUCuda); - CPPUNIT_TEST(testProduct); - CPPUNIT_TEST_SUITE_END(); - -public: - void setUp() override {} - void tearDown() override {} - - void testDefault(); - void testCPU(); - void testGPUMock(); - void testGPUCuda(); - void testGPUAll(); - void testMoveGPUMock(); - void testMoveGPUCuda(); - void testProduct(); -}; - -///registration of the test so that the runner can find it -CPPUNIT_TEST_SUITE_REGISTRATION(testHeterogeneousProduct); - -void testHeterogeneousProduct::testDefault() { - HeterogeneousProductImpl, heterogeneous::GPUMockProduct > prod; - - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - - CPPUNIT_ASSERT_THROW(prod.getProduct(), cms::Exception); - CPPUNIT_ASSERT_THROW(prod.getProduct(), cms::Exception); -} - -void testHeterogeneousProduct::testCPU() { - HeterogeneousProductImpl, heterogeneous::GPUMockProduct > prod{ - heterogeneous::HeterogeneousDeviceTag(), 5}; - - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == false); - 
CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - - CPPUNIT_ASSERT(prod.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod.getProduct(), cms::Exception); -} - -void testHeterogeneousProduct::testGPUMock() { - HeterogeneousProductImpl, heterogeneous::GPUMockProduct > prod{ - heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0), - [](const int& src, int& dst) { dst = src; }}; - - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - - CPPUNIT_ASSERT(prod.onDevices(HeterogeneousDevice::kGPUMock)[0] == true); - CPPUNIT_ASSERT(prod.onDevices(HeterogeneousDevice::kGPUMock)[1] == false); - - CPPUNIT_ASSERT(prod.getProduct() == 5); - - // Automatic transfer - CPPUNIT_ASSERT(prod.getProduct() == 5); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod.getProduct() == 5); -} - -void testHeterogeneousProduct::testGPUCuda() { - HeterogeneousProductImpl, heterogeneous::GPUCudaProduct > prod{ - heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, 1), - [](const int& src, int& dst) { dst = src; }}; - - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - - CPPUNIT_ASSERT(prod.onDevices(HeterogeneousDevice::kGPUCuda)[0] == false); - CPPUNIT_ASSERT(prod.onDevices(HeterogeneousDevice::kGPUCuda)[1] == true); - CPPUNIT_ASSERT(prod.onDevices(HeterogeneousDevice::kGPUCuda)[2] == false); - - CPPUNIT_ASSERT(prod.getProduct() == 5); - - // Automatic transfer - CPPUNIT_ASSERT(prod.getProduct() == 5); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - CPPUNIT_ASSERT(prod.getProduct() == 5); -} - -void testHeterogeneousProduct::testGPUAll() { - // Data initially on CPU - HeterogeneousProductImpl, - heterogeneous::GPUMockProduct, - heterogeneous::GPUCudaProduct > - prod1{heterogeneous::HeterogeneousDeviceTag(), 5}; - - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - - CPPUNIT_ASSERT(prod1.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod1.getProduct(), cms::Exception); - CPPUNIT_ASSERT_THROW(prod1.getProduct(), cms::Exception); - - // Data initially on GPUMock - HeterogeneousProductImpl, - heterogeneous::GPUMockProduct, - heterogeneous::GPUCudaProduct > - prod2{heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0), - [](const int& src, int& dst) { dst = src; }}; - - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - - CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUMock)[0] == true); - 
CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUMock)[1] == false); - - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - - // Automatic transfer - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - - // Data initially on GPUCuda - HeterogeneousProductImpl, - heterogeneous::GPUMockProduct, - heterogeneous::GPUCudaProduct > - prod3{heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, 2), - [](const int& src, int& dst) { dst = src; }}; - - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - - CPPUNIT_ASSERT(prod3.onDevices(HeterogeneousDevice::kGPUCuda)[0] == false); - CPPUNIT_ASSERT(prod3.onDevices(HeterogeneousDevice::kGPUCuda)[1] == false); - CPPUNIT_ASSERT(prod3.onDevices(HeterogeneousDevice::kGPUCuda)[2] == true); - CPPUNIT_ASSERT(prod3.onDevices(HeterogeneousDevice::kGPUCuda)[3] == false); - - CPPUNIT_ASSERT_THROW(prod3.getProduct(), cms::Exception); - CPPUNIT_ASSERT(prod3.getProduct() == 5); - - // Automatic transfer - CPPUNIT_ASSERT(prod3.getProduct() == 5); - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod3.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - CPPUNIT_ASSERT_THROW(prod3.getProduct(), cms::Exception); - CPPUNIT_ASSERT(prod3.getProduct() == 5); -} - -void testHeterogeneousProduct::testMoveGPUMock() { - // Data initially on GPUMock - using Type = HeterogeneousProductImpl, - heterogeneous::GPUMockProduct, - heterogeneous::GPUCudaProduct >; - Type prod1{heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0), - [](const int& src, int& dst) { dst = src; }}; - Type prod2; - - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod1.onDevices(HeterogeneousDevice::kGPUMock)[0] == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUMock).none() == true); - - CPPUNIT_ASSERT(prod1.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod1.getProduct(), cms::Exception); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - - // move - prod2 = std::move(prod1); - - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUMock)[0] == true); - - CPPUNIT_ASSERT(prod2.getProduct() == 5); - 
CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - - // automatic transfer - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); -} - -void testHeterogeneousProduct::testMoveGPUCuda() { - // Data initially on GPUCuda - using Type = HeterogeneousProductImpl, - heterogeneous::GPUMockProduct, - heterogeneous::GPUCudaProduct >; - Type prod1{heterogeneous::HeterogeneousDeviceTag(), - 5, - HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, 3), - [](const int& src, int& dst) { dst = src; }}; - Type prod2; - - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod1.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - CPPUNIT_ASSERT(prod1.onDevices(HeterogeneousDevice::kGPUCuda)[3] == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == false); - CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUCuda).none() == true); - - CPPUNIT_ASSERT_THROW(prod1.getProduct(), cms::Exception); - CPPUNIT_ASSERT(prod1.getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - - // move - prod2 = std::move(prod1); - - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - CPPUNIT_ASSERT(prod2.onDevices(HeterogeneousDevice::kGPUCuda)[3] == true); - - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - CPPUNIT_ASSERT(prod2.getProduct() == 5); - - // automatic transfer - CPPUNIT_ASSERT(prod2.getProduct() == 5); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kCPU) == true); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUMock) == false); - CPPUNIT_ASSERT(prod2.isProductOn(HeterogeneousDevice::kGPUCuda) == true); - CPPUNIT_ASSERT_THROW(prod2.getProduct(), cms::Exception); - CPPUNIT_ASSERT(prod2.getProduct() == 5); -} - -void testHeterogeneousProduct::testProduct() { - using Type1 = HeterogeneousProductImpl, heterogeneous::GPUMockProduct >; - using Type2 = HeterogeneousProductImpl, heterogeneous::GPUCudaProduct >; - - Type1 data1{heterogeneous::HeterogeneousDeviceTag(), 5}; - Type2 data2{heterogeneous::HeterogeneousDeviceTag(), 10}; - - HeterogeneousProduct prod{}; - CPPUNIT_ASSERT(prod.isNull() == true); - CPPUNIT_ASSERT(prod.isNonnull() == false); - CPPUNIT_ASSERT_THROW(prod.get(), cms::Exception); - - HeterogeneousProduct prod1{std::move(data1)}; - CPPUNIT_ASSERT(prod1.isNull() == false); - CPPUNIT_ASSERT(prod1.isNonnull() == true); - CPPUNIT_ASSERT(prod1.get().getProduct() == 5); - CPPUNIT_ASSERT_THROW(prod1.get(), cms::Exception); - - HeterogeneousProduct prod2{std::move(data2)}; - CPPUNIT_ASSERT(prod2.isNull() == false); - CPPUNIT_ASSERT(prod2.isNonnull() == true); - CPPUNIT_ASSERT_THROW(prod2.get(), cms::Exception); - CPPUNIT_ASSERT(prod2.get().getProduct() == 10); - - prod1 = std::move(prod2); - 
CPPUNIT_ASSERT_THROW(prod1.get(), cms::Exception); - CPPUNIT_ASSERT(prod1.get().getProduct() == 10); - - prod = std::move(prod1); - CPPUNIT_ASSERT(prod.isNull() == false); - CPPUNIT_ASSERT(prod.isNonnull() == true); - CPPUNIT_ASSERT_THROW(prod.get(), cms::Exception); - CPPUNIT_ASSERT(prod.get().getProduct() == 10); -} - -#include diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 546aa92692b9e..1acd271a17e00 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -1,19 +1,17 @@ - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index 0868b38c07652..49be86daa18cd 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -1,13 +1,11 @@ + - - - diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc index cff4f61004def..a4f19ac276a7a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -1,12 +1,12 @@ #include #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -20,7 +20,6 @@ #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" class SiPixelRecHitFromSOA : public edm::stream::EDProducer { diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index 8c0261ee0d999..ecfbd99b667fc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -1,8 +1,6 @@ - - - + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index 3f4013c196f07..cd143fb3aab2c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,28 +1,26 @@ #include #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" #include 
"FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" #include "FWCore/Framework/interface/global/EDAnalyzer.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" - class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { public: explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 2d6da6a631151..3e73cfd7a4e96 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,22 +2,20 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Framework/interface/stream/EDProducer.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" - -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { public: diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 1554e515ad437..cacfe4662b51f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,17 +1,15 @@ + - - - diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml index 0c2d73dc13a12..2f89a8e353778 100644 --- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml @@ -1,24 +1,20 @@ - + + + + + + + - - - - - - - - - - - - - - - + + + + + 
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc
index 8008af34855ce..7a1bd064e5a2b 100644
--- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc
@@ -1,23 +1,21 @@
 #include
-#include "CUDADataFormats/Common/interface/HostProduct.h"
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Framework/interface/stream/EDProducer.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 class PixelVertexSoAFromCUDA : public edm::stream::EDProducer {
 public:
diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
index c10ee14dc3638..26ef004940306 100644
--- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
+++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
@@ -1,14 +1,12 @@
+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index 29a67c255ee1b..5c5e5fecf41cf 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -1,37 +1,35 @@
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
-#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
-#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
 #include "DataFormats/GeometrySurface/interface/Plane.h"
-#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
 #include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h"
+#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
-#include "Geometry/Records/interface/TrackerTopologyRcd.h"
-#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "FWCore/Utilities/interface/InputTag.h"
 #include "Geometry/CommonDetUnit/interface/GeomDet.h"
-#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
 #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
 #include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h"
 #include "TrackingTools/Records/interface/TrackingComponentsRecord.h"
-#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
-#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
 #include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
 #include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
-
 /* produces seeds directly from cuda produced tuples */
diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
index 896893c28a468..1a01ad8e00979 100644
--- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
@@ -1,13 +1,10 @@
+
-
-
-
-
diff --git a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc
index 2c87ad818e185..83fbde7a7fa2d 100644
--- a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc
+++ b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc
@@ -2,24 +2,22 @@
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
 #include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/global/EDAnalyzer.h"
-#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-
 #include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h"
 class ClusterTPCUDAdump : public edm::global::EDAnalyzer<> {