-
Notifications
You must be signed in to change notification settings - Fork 3
/
OclWrapper.h
283 lines (260 loc) · 7.91 KB
/
OclWrapper.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
* (c) 2011 Wim Vanderbauwhede <wim.vanderbauwhede@gmail.com>
*
* */
#ifndef __OCLWRAPPER_H__
#define __OCLWRAPPER_H__
#ifndef OCLV2
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#else
// OpenCL v1.2
#include <vector>
#endif
#ifdef OCLV22
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
#endif
#ifdef OSX
// On OS X headers are in non-standard location
#include <cl.hpp>
#ifdef OCLV2
//#ifndef OCLV22
#include <OclKernelFunctor.h>
//#endif
#endif
#else
// Not OS X, i.e. Linux
#ifdef OCLV2
#ifdef OCLV22
#include <CL/cl2.hpp>
#else
#include <CL/cl.hpp>
#endif
//#ifndef OCLV22
// OpenCL v1.2
#include <OclKernelFunctor.h>
//#endif
#else
#ifndef FPGA
#include <cl.hpp>
#else
// For Altera
#include <CL/cl.hpp>
#include <OclKernelFunctor.h>
#endif // FPGA
#endif
#endif
#include <sys/time.h>
//#include <cstdio>
//#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
#include <cstdlib>
//#include <iterator>
//#include <Timing.h>
#include <DeviceInfo.h>
#include <PlatformInfo.h>
#define OPENCL_TIMINGS
#define NBUFS 16
// Error-check helper for OpenCL status codes. Only declared here; the
// definition is not in the visible part of this header.
// NOTE(review): presumably reports `name` and aborts when err != CL_SUCCESS —
// confirm against the implementation.
void checkErr(cl_int err, const char * name);
// Wall-clock timestamp in milliseconds; the definition is at the end of this header.
inline double wsecond();
// Shorthand for unsigned int, declared at global scope.
typedef unsigned int uint;
// Device categories accepted by OclWrapper::selectDevice(int, int, DeviceType).
// NOTE(review): ACC presumably stands for "accelerator" — confirm.
enum DeviceType { CPU, GPU, ACC };
// Choose the cl_mem host-pointer flag(s) that CL_MEM_READ_MODE expands to,
// based on the build-time value of MRMODE:
//   0         -> CL_MEM_COPY_HOST_PTR
//   1         -> CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR
//   2         -> CL_MEM_USE_HOST_PTR
//   otherwise -> CL_MEM_COPY_HOST_PTR
// An undefined MRMODE evaluates to 0 inside #if, so it selects the first
// branch; the final #else is only reached when MRMODE is defined to a value
// other than 0, 1 or 2.
// NOTE(review): CL_MEM_READ_MODE is not used in this header; presumably the
// implementation ORs it into the flags when creating buffers — confirm.
#if MRMODE==0
#define CL_MEM_READ_MODE CL_MEM_COPY_HOST_PTR
#elif MRMODE==1
#define CL_MEM_READ_MODE (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)
#elif MRMODE==2
// Seems this is problematic!
#define CL_MEM_READ_MODE CL_MEM_USE_HOST_PTR
#else
#define CL_MEM_READ_MODE CL_MEM_COPY_HOST_PTR // default
#endif
// Wrapper class around the OpenCL C++ bindings (cl::Platform, cl::Device,
// cl::Context, cl::Program, cl::Kernel, cl::CommandQueue, cl::Buffer).
// Only declarations appear here; the member functions are defined elsewhere.
//
// Build-time switches that change the shape of this class:
//   OCLV2             - containers are std::vector instead of cl::vector, and
//                       runKernel is an OclKernelFunctor instead of a
//                       cl::KernelFunctor (kernel_functor is then absent).
//   OCLV22            - event lists in readBuffer/writeBuffer are
//                       std::vector<cl::Event> instead of VECTOR_CLASS<cl::Event>.
//   FPGA_MULTI_KERNEL - vectors of kernels and command queues replace the
//                       single kernel/queue members.
//   PLATINFO          - adds the platformInfo member.
//   OPENCL_TIMINGS    - declares getExecutionTime().
//
// NOTE(review): most data members, including raw pointers, are public. The
// ownership and lifetime of the pointed-to objects are not visible in this
// header — confirm in the implementation before copying OclWrapper objects.
class OclWrapper {
private:
// NOTE(review): presumably the kernel source (or its file name) — confirm in the implementation.
std::string kernelsource;
// Platform list; the container type follows the OCLV2 switch.
#ifdef OCLV2
std::vector<cl::Platform> platformList;
#else
cl::vector<cl::Platform> platformList;
#endif
// This assumes a single context, a single device, a single program and a single kernel
// cl::vector<cl::Device> devices;
// Kernel build options as a C string (see also the public kernelOpts stream).
const char* kernel_opts;
// int platformIdx, deviceIdx;
// Device-type selection flags.
// NOTE(review): presumably set by selectCPU()/selectGPU()/selectACC() — confirm.
bool useCPU;
bool useGPU;
bool useACC;
cl_int err; // error code returned from API calls
// Internal setup helpers; their behaviour is defined in the implementation file.
void getContextAndDevices();
void getDevices();
// For the Fortran interface, but we could use this approach in C/C++ as well
void initArgStatus();
public:
// Device list; the container type follows the OCLV2 switch.
#ifdef OCLV2
std::vector<cl::Device> devices;
#else
cl::vector<cl::Device> devices;
#endif
cl::Context* context_p;
// Single kernel (pointer and value) by default; vectors of kernels under
// FPGA_MULTI_KERNEL.
// NOTE(review): std::vector is used in the FPGA_MULTI_KERNEL branches, but this
// header includes <vector> itself only when OCLV2 is defined; otherwise it
// relies on another include providing it — confirm.
#ifndef FPGA_MULTI_KERNEL
cl::Kernel* kernel_p;
cl::Kernel kernel;
#else
std::vector<cl::Kernel*> kernel_ps;
std::vector<cl::Kernel> kernels;
#endif
cl::Program* program_p;
// Kernel functor member(s); the type depends on the OpenCL bindings version.
#ifndef OCLV2
cl::KernelFunctor runKernel;
cl::KernelFunctor kernel_functor;
#else
//#ifndef OCLV22
// cl::KernelFunctor in v2.x is very different
// cl::KernelFunctor runKernel;
OclKernelFunctor runKernel;
//#endif
#endif
// Single command queue (pointer and value) by default; vectors of queues under
// FPGA_MULTI_KERNEL.
#ifndef FPGA_MULTI_KERNEL
cl::CommandQueue* queue_p;
cl::CommandQueue queue;
#else
std::vector<cl::CommandQueue*> queue_ps;
std::vector<cl::CommandQueue> queues;
#endif
// Fixed-size table of NBUFS buffer pointers.
// NOTE(review): presumably indexed by kernel argument position, as used by
// the *Pos methods — confirm.
cl::Buffer* buf[NBUFS];
cl::Buffer* buf_p;
int nPlatforms;
DeviceInfo deviceInfo;
int platformIdx, deviceIdx;
#ifdef PLATINFO
PlatformInfo platformInfo;
#endif
int ncalls;
// For the Fortran interface, but we could use this approach in C/C++ as well
int argStatus[NBUFS];
// Stream holding kernel build options (see setKernelOpts()).
std::ostringstream kernelOpts;
// Constructors. The full overload takes kernel source, kernel name and build
// options; devIdx and platIdx default to -1.
// NOTE(review): -1 presumably means "not specified" — confirm.
// NOTE(review): OclWrapper(int) is not explicit, so an int converts implicitly
// to an OclWrapper.
OclWrapper ();
OclWrapper (int deviceIdx);
OclWrapper (int deviceIdx, int platformIdx);
OclWrapper (const char* ksource, const char* kname, const char* kopts="",int devIdx=-1,int platIdx=-1);
// Takes the same first three arguments as the full constructor.
void initOclWrapper(const char* ksource, const char* kname, const char* kopts="");
// Platform/device queries. NOTE(review): pIdx is presumably an index into
// platformList — confirm.
bool hasCPU(int pIdx);
bool hasGPU(int pIdx);
bool hasACC(int pIdx);
void showDeviceInfo();
int nDevices(int pIdx, std::string dev);
// Device selection, from fully specified (platform, device, type) down to no
// arguments.
void selectDevice(int platformIdx, int deviceIdx, DeviceType devt);
void selectDevice(int platformIdx, int deviceIdx);
void selectDevice(int deviceIdx);
void selectDevice();
void selectCPU();
void selectGPU();
void selectACC();
// Program build and kernel loading. ksource is the kernel source argument,
// kname the kernel name, opts the build options.
void setKernelOpts();
void buildProgram(const char* ksource,const char* opts);
void reloadKernel(const char* kname);
void loadKernel(const char* kname);
void loadKernel(const char* ksource, const char* kname);
void loadKernel(const char* ksource, const char* kname, const char* opts);
//#ifndef OCLV2
void loadBinary(const char* ksource);
void storeBinary(const char* ksource);
//#endif
// Device property getters.
// NOTE(review): presumably forwarded from deviceInfo — confirm.
// wv_size_t is not declared in this header; presumably it comes from an
// included project header.
int getMaxComputeUnits();
int getGlobalMemCacheType();
unsigned long int getGlobalMemSize();
unsigned long int getLocalMemSize();
wv_size_t getPreferredWorkGroupSizeMultiple();
wv_size_t getNThreadsHint();
wv_size_t getWorkGroupSize();
// Buffer creation. NOTE(review): whether bufSize is in bytes or elements is
// not visible here — confirm.
// The void* overloads default hostBuf to NULL; the const void* overloads have
// no default for hostBuf, so a call passing only bufSize resolves to the
// void* overload.
cl::Buffer& makeWriteBuffer( int bufSize );
// cl::Buffer* makeStaticWriteBuffer( int idx,int bufSize );
void makeWriteBufferPos(int argpos, int bufSize );
cl::Buffer& makeReadBuffer(int bufSize, void* hostBuf = NULL, cl_mem_flags flags = CL_MEM_READ_ONLY );
cl::Buffer& makeReadWriteBuffer(int bufSize, void* hostBuf = NULL, cl_mem_flags flags = CL_MEM_READ_WRITE);
cl::Buffer& makeReadBuffer(int bufSize, const void* hostBuf , cl_mem_flags flags = CL_MEM_READ_ONLY );
cl::Buffer& makeReadWriteBuffer(int bufSize, const void* hostBuf , cl_mem_flags flags = CL_MEM_READ_WRITE);
// cl::Buffer* makeStaticReadBuffer(int idx,int bufSize, void* hostBuf = NULL, cl_mem_flags flags = CL_MEM_READ_ONLY );
void makeReadBufferPos(int argpos, int bufSize);
void createQueue();
// Kernel argument setters for buffer, int and float arguments; idx is the
// argument index.
void setArg(unsigned int idx, const cl::Buffer& buf);
void setArg(unsigned int idx, const int buf);
void setArg(unsigned int idx, const float buf);
// Kernel launch. The range parameters are unnamed.
// NOTE(review): presumably (offset, global, local) for enqueueNDRangeOffset
// and (global, local) for the others, mirroring
// cl::CommandQueue::enqueueNDRangeKernel — confirm.
// NOTE(review): the meaning of the int and float return values is not visible
// here.
//#ifndef OCLV22
int enqueueNDRangeOffset(const cl::NDRange& = cl::NDRange(0),const cl::NDRange& = cl::NDRange(1),const cl::NDRange& = cl::NullRange);
//#endif
//#ifndef OCLV22
int enqueueNDRange(const cl::NDRange& = cl::NDRange(1),const cl::NDRange& = cl::NullRange);
//#endif
int enqueueNDRangeRun(const cl::NDRange& = cl::NDRange(1),const cl::NDRange& = cl::NullRange);
float enqueueNDRangeRun(unsigned int = 1, unsigned int = 0);
// Device-to-host transfers. The short overloads take (buffer, size, host
// pointer); the long overloads add blocking flag, offset, wait-event list and
// completion event, with the event arguments defaulting to NULL.
// NOTE(review): the const void* overloads take the destination as a pointer
// to const; presumably the implementation casts constness away to write into
// it — confirm this is intended.
void readBuffer(const cl::Buffer& deviceBuf, int bufSize, void* hostBuf);
void readBuffer(const cl::Buffer& deviceBuf, int bufSize, const void* hostBuf);
void readBuffer(
const cl::Buffer& buffer,
bool blocking_read,
::wv_size_t offset,
::wv_size_t size,
void * ptr,
#ifndef OCLV22
const VECTOR_CLASS<cl::Event> * events = NULL,
#else
const std::vector<cl::Event> * events = NULL,
#endif
cl::Event * event = NULL);
void readBuffer(
const cl::Buffer& buffer,
bool blocking_read,
::wv_size_t offset,
::wv_size_t size,
const void * ptr,
#ifndef OCLV22
const VECTOR_CLASS<cl::Event> * events = NULL,
#else
const std::vector<cl::Event> * events = NULL,
#endif
cl::Event * event = NULL);
// void readStaticBuffer(int idx, int bufSize, void* hostBuf);
void readBufferPos(int argpos, int bufSize, void* hostBuf);
// Host-to-device transfers; the overload set has the same shape as readBuffer.
void writeBuffer(const cl::Buffer& deviceBuf, int bufSize, void* hostBuf);
void writeBuffer(
const cl::Buffer& deviceBuf,
bool blocking_write,
::wv_size_t offset,
::wv_size_t size,
void * ptr,
#ifndef OCLV22
const VECTOR_CLASS<cl::Event> * events = NULL,
#else
const std::vector<cl::Event> * events = NULL,
#endif
cl::Event * event = NULL);
void writeBuffer(const cl::Buffer& deviceBuf, int bufSize, const void* hostBuf);
void writeBuffer(
const cl::Buffer& deviceBuf,
bool blocking_write,
::wv_size_t offset,
::wv_size_t size,
const void * ptr,
#ifndef OCLV22
const VECTOR_CLASS<cl::Event> * events = NULL,
#else
const std::vector<cl::Event> * events = NULL,
#endif
cl::Event * event = NULL);
void writeBufferPos(int argpos, int bufSize, void* hostBuf);
// Declared only when OPENCL_TIMINGS is defined.
// NOTE(review): the unit of the returned time is not visible here — confirm.
#ifdef OPENCL_TIMINGS
double getExecutionTime (const cl::Event& event);
#endif
};
// Returns the current wall-clock time in milliseconds, as reported by
// gettimeofday() (seconds plus microseconds, scaled to ms).
// Intended for coarse host-side interval timing: subtract two readings.
double wsecond()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    // Combine whole seconds and the microsecond remainder into fractional seconds.
    const double seconds = now.tv_sec + (now.tv_usec / 1000000.0);

    return seconds * 1000.0; // convert seconds to milliseconds
}
#endif // __OCLWRAPPER_H__