oclkernels.cl
/* Kernel file for OpenCL. Includes all subfunctions of the Matvec routine as OpenCL kernels
*
* Copyright (C) ADDA contributors
* This file is part of ADDA.
*
* ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with ADDA. If not, see
* <http://www.gnu.org/licenses/>.
*/
// Somehow current AMD drivers do not define CL_VERSION_1_0 when compiling OpenCL kernels
//#ifndef CL_VERSION_1_0
//# error "OpenCL version at least 1.0 is required"
//#endif
// Enable printf for debugging purposes on AMD devices
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#ifdef USE_DOUBLE
# ifdef DOUBLE_AMD
# pragma OPENCL EXTENSION cl_amd_fp64 : enable
# else
# pragma OPENCL EXTENSION cl_khr_fp64 : enable
# endif
#endif
// defines type to work with variables defined as size_t in the main ADDA code
#ifdef SIZET_UINT
typedef uint in_sizet;
#elif defined(SIZET_ULONG)
typedef ulong in_sizet;
#else
# error "size_t alternative is not defined"
#endif
//======================================================================================================================
// functions used in kernels
void cMult(__constant double2 *a,__global const double2 *b,__global double2 *c)
// complex multiplication; c=ab; !!! c should be different from a and b !!!
{
(*c).s0=(*a).s0*(*b).s0 - (*a).s1*(*b).s1;
(*c).s1=(*a).s1*(*b).s0 + (*a).s0*(*b).s1;
}
//======================================================================================================================
void cMult2(__constant double2 *a,__global const double2 *b,double2 *c)
// complex multiplication; c=ab; !!! c should be different from a and b !!! c is private
{
(*c).s0=(*a).s0*(*b).s0 - (*a).s1*(*b).s1;
(*c).s1=(*a).s1*(*b).s0 + (*a).s0*(*b).s1;
}
//======================================================================================================================
double cvNorm2(__global const double2 *a)
// square of the norm of a complex vector[3]
{
return ( a[0].s0*a[0].s0 + a[0].s1*a[0].s1 + a[1].s0*a[1].s0 + a[1].s1*a[1].s1
+ a[2].s0*a[2].s0 + a[2].s1*a[2].s1 );
}
//======================================================================================================================
// Arith1 kernels
__kernel void nConj(__global double2 *a)
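// conjugates each element of the vector (negates the imaginary part)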
{
const size_t id=get_global_id(0);
a[id].s1= -a[id].s1;
}
//======================================================================================================================
__kernel void clzero(__global double2 *input)
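// sets each complex element to zero; the double scalar 0.0 is implicitly replicated over both components of double2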
{
const size_t id = get_global_id(0);
input[id] = 0.0;
}
//======================================================================================================================
__kernel void arith1(__global const uchar *material,__global const ushort *position,__constant double2 *cc_sqrt,
__global const double2 *argvec, __global double2 *Xmatrix,const in_sizet local_Nsmall,const in_sizet smallY,
const in_sizet gridX)
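// first step of the matvec: Xmatrix=cc_sqrt*argvec, scattered from dipole order to the dipole positions on the FFT grid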
{
const size_t id=get_global_id(0);
const size_t j=3*id;
const uchar mat=material[id];
size_t index;
int xcomp;
index = ((position[j+2]*smallY+position[j+1])*gridX+position[j]);
for (xcomp=0;xcomp<3;xcomp++) cMult(&cc_sqrt[mat*3+xcomp],&argvec[j+xcomp],&Xmatrix[index+xcomp*local_Nsmall]);
}
//======================================================================================================================
// Arith2 kernel
__kernel void arith2(__global const double2 *Xmatrix,__global double2 *slices,const in_sizet gridZ,
const in_sizet smallY,const in_sizet gridX,const in_sizet gridYZ,const in_sizet local_Nsmall)
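// copies Xmatrix into the slices buffer, one (y,z) slice for each x index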
{
const size_t xa=get_global_id(0);
const size_t x=get_global_id(0)-get_global_offset(0);
const size_t y=get_global_id(2);
const size_t z=get_global_id(1);
const size_t local_gridX=get_global_size(0);
size_t i;
size_t j;
int xcomp;
i = y*gridZ+z+x*gridYZ;
j = (z*smallY+y)*gridX+xa;
for (xcomp=0;xcomp<3;xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
slices[i+xcomp*gridYZ*local_gridX]=Xmatrix[j+xcomp*local_Nsmall];
}
}
//======================================================================================================================
// Arith3 kernels and functions
void cSymMatrVec(const double2 *matr,const double2 *vec,double2 *res)
// multiplication of complex symmetric matrix[6] by complex vec[3]; res=matr.vec
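/* the 6 components correspond to the full symmetric matrix:
 *   ( m[0] m[1] m[2] )
 *   ( m[1] m[3] m[4] )
 *   ( m[2] m[4] m[5] )
 */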
{
res[0].s0 = matr[0].s0*vec[0].s0 - matr[0].s1*vec[0].s1
+ matr[1].s0*vec[1].s0 - matr[1].s1*vec[1].s1
+ matr[2].s0*vec[2].s0 - matr[2].s1*vec[2].s1;
res[0].s1 = matr[0].s0*vec[0].s1 + matr[0].s1*vec[0].s0
+ matr[1].s0*vec[1].s1 + matr[1].s1*vec[1].s0
+ matr[2].s0*vec[2].s1 + matr[2].s1*vec[2].s0;
res[1].s0 = matr[1].s0*vec[0].s0 - matr[1].s1*vec[0].s1
+ matr[3].s0*vec[1].s0 - matr[3].s1*vec[1].s1
+ matr[4].s0*vec[2].s0 - matr[4].s1*vec[2].s1;
res[1].s1 = matr[1].s0*vec[0].s1 + matr[1].s1*vec[0].s0
+ matr[3].s0*vec[1].s1 + matr[3].s1*vec[1].s0
+ matr[4].s0*vec[2].s1 + matr[4].s1*vec[2].s0;
res[2].s0 = matr[2].s0*vec[0].s0 - matr[2].s1*vec[0].s1
+ matr[4].s0*vec[1].s0 - matr[4].s1*vec[1].s1
+ matr[5].s0*vec[2].s0 - matr[5].s1*vec[2].s1;
res[2].s1 = matr[2].s0*vec[0].s1 + matr[2].s1*vec[0].s0
+ matr[4].s0*vec[1].s1 + matr[4].s1*vec[1].s0
+ matr[5].s0*vec[2].s1 + matr[5].s1*vec[2].s0;
}
//======================================================================================================================
void cReflMatrVec(const double2 *matr,const double2 *vec,double2 *res)
/* multiplication of matrix[6] by complex vec[3]; res=matr.vec; passed components are the same as for symmetric matrix:
* 11,12,13,22,23,33, but the matrix has the following symmetry - M21=M12, M31=-M13, M32=-M23
*/
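/* i.e. the full matrix is:
 *   (  m[0]  m[1]  m[2] )
 *   (  m[1]  m[3]  m[4] )
 *   ( -m[2] -m[4]  m[5] )
 */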
{
res[0].s0 = matr[0].s0*vec[0].s0 - matr[0].s1*vec[0].s1
+ matr[1].s0*vec[1].s0 - matr[1].s1*vec[1].s1
+ matr[2].s0*vec[2].s0 - matr[2].s1*vec[2].s1;
res[0].s1 = matr[0].s0*vec[0].s1 + matr[0].s1*vec[0].s0
+ matr[1].s0*vec[1].s1 + matr[1].s1*vec[1].s0
+ matr[2].s0*vec[2].s1 + matr[2].s1*vec[2].s0;
res[1].s0 = matr[1].s0*vec[0].s0 - matr[1].s1*vec[0].s1
+ matr[3].s0*vec[1].s0 - matr[3].s1*vec[1].s1
+ matr[4].s0*vec[2].s0 - matr[4].s1*vec[2].s1;
res[1].s1 = matr[1].s0*vec[0].s1 + matr[1].s1*vec[0].s0
+ matr[3].s0*vec[1].s1 + matr[3].s1*vec[1].s0
+ matr[4].s0*vec[2].s1 + matr[4].s1*vec[2].s0;
res[2].s0 = - matr[2].s0*vec[0].s0 + matr[2].s1*vec[0].s1
- matr[4].s0*vec[1].s0 + matr[4].s1*vec[1].s1
+ matr[5].s0*vec[2].s0 - matr[5].s1*vec[2].s1;
res[2].s1 = - matr[2].s0*vec[0].s1 - matr[2].s1*vec[0].s0
- matr[4].s0*vec[1].s1 - matr[4].s1*vec[1].s0
+ matr[5].s0*vec[2].s1 + matr[5].s1*vec[2].s0;
}
//======================================================================================================================
void cvAdd(const double2 *a, const double2 *b,double2 *c)
// addition of two complex vectors[3]; c=a+b; component-wise addition is supported natively by OpenCL vector types
{
c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
}
//======================================================================================================================
__kernel void arith3(__global double2 *slices_tr,__global const double2 *Dmatrix,const in_sizet smallY,
const in_sizet smallZ,const in_sizet gridX,const in_sizet DsizeY,const in_sizet DsizeZ,const char NDCOMP,
const char reduced_FFT,const char transposed)
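// multiplies each grid point of the Fourier-transformed slices by the 3x3 symmetric matrix given by the 6 components of Dmatrix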
{
size_t const y = get_global_id(0);
size_t const z = get_global_id(1);
// xl (local) is the index for the slices
size_t const xl = get_global_id(2)-get_global_offset(2);
// xa is the x index for Dmatrix
size_t xa=get_global_id(2);
size_t const gridY = get_global_size(0);
size_t const gridZ = get_global_size(1);
size_t const local_gridX = get_global_size(2);
double2 xv[3];
double2 yv[3];
double2 fmat[6];
int Xcomp;
const size_t i=z*gridY+y; // indexSliceZY
size_t j;
size_t ya=y;
size_t za=z;
// element-wise copy works because of the double2 vector type
for (Xcomp=0;Xcomp<3;Xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
xv[Xcomp]=slices_tr[i+(xl+Xcomp*local_gridX)*gridY*gridZ];
}
// indexDmatrix_mv
if (transposed==1) { // almost never happens
if (xa>0) xa=gridX-xa;
if (y>0) ya=gridY-y;
if (z>0) za=gridZ-z;
}
else {
if (y>=DsizeY) ya=gridY-y;
if (z>=DsizeZ) za=gridZ-z;
}
j=NDCOMP*(xa*DsizeY*DsizeZ+za*DsizeY+ya);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int f=0;f<6;f++) fmat[f]=Dmatrix[j+f];
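// with reduced_FFT only part of Dmatrix is stored; values for mirrored y and z indices differ only by the sign of the off-diagonal components, which is restored here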
if (reduced_FFT==1) {
if (y>smallY) {
fmat[1]*=-1;
if (z>smallZ) fmat[2]*=-1;
else fmat[4]*=-1;
}
else if (z>smallZ) {
fmat[2]*=-1;
fmat[4]*=-1;
}
}
cSymMatrVec(fmat,xv,yv); // yv=fmat*xv
for (Xcomp=0;Xcomp<3;Xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
slices_tr[i+(xl+Xcomp*local_gridX)*gridY*gridZ]=yv[Xcomp];
}
}
//======================================================================================================================
__kernel void arith3_surface(__global double2 *slices_tr,__global const double2 *Dmatrix,const in_sizet smallY,
const in_sizet smallZ,const in_sizet gridX,const in_sizet DsizeY,const in_sizet DsizeZ,const char NDCOMP,
const char reduced_FFT,const char transposed,const in_sizet RsizeY,__global double2 *slicesR_tr,
__global const double2 *Rmatrix)
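// same as arith3, but additionally multiplies the reflected-field slices (slicesR_tr) by Rmatrix and adds the result to slices_tr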
{
size_t const y = get_global_id(0);
size_t const z = get_global_id(1);
size_t const x = get_global_id(2)-get_global_offset(2);
size_t const gridY = get_global_size(0);
size_t const gridZ = get_global_size(1);
size_t const local_gridX = get_global_size(2);
double2 xv[3];
double2 yv[3];
double2 xvR[3];
double2 yvR[3];
double2 fmat[6];
int Xcomp;
const size_t i=z*gridY+y; // indexSliceZY
size_t j;
// the offset is needed for xa, since it is used to calculate the index into Dmatrix
size_t xa=x+get_global_offset(2);
size_t ya=y;
size_t za=z;
// element-wise copy works because of the double2 vector type
for (Xcomp=0;Xcomp<3;Xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
xv[Xcomp]=slices_tr[i+x*gridY*gridZ+Xcomp*gridY*gridZ*local_gridX];
}
// indexDmatrix_mv
if (transposed==1) { // almost never happens
if (x>0) xa=gridX-x;
if (y>0) ya=gridY-y;
if (z>0) za=gridZ-z;
}
else {
if (y>=DsizeY) ya=gridY-y;
if (z>=DsizeZ) za=gridZ-z;
}
j=NDCOMP*(xa*DsizeY*DsizeZ+za*DsizeY+ya);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int f=0;f<6;f++) fmat[f]=Dmatrix[j+f];
if (reduced_FFT==1) {
if (y>smallY) {
fmat[1]*=-1;
if (z>smallZ) fmat[2]*=-1;
else fmat[4]*=-1;
}
else if (z>smallZ) {
fmat[2]*=-1;
fmat[4]*=-1;
}
}
cSymMatrVec(fmat,xv,yv); // yv=fmat*xv
// surface part
for (Xcomp=0;Xcomp<3;Xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
xvR[Xcomp]=slicesR_tr[i+x*gridY*gridZ+Xcomp*gridY*gridZ*local_gridX];
}
// indexRmatrix_mv; first resetting indices
xa=x+get_global_offset(2);
ya=y;
za=z;
if (transposed==1) { // almost never happens
if (x>0) xa=gridX-x;
else xa=x;
if (y>0) ya=gridY-y;
else ya=y;
}
else {
if (y>=RsizeY) ya=gridY-y;
else ya=y;
}
j=NDCOMP*((xa*gridZ+za )*RsizeY+ya);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int f=0;f<6;f++) fmat[f]=Rmatrix[j+f];
if (reduced_FFT==1 && y>=RsizeY) {
fmat[1]*=-1;
fmat[4]*=-1;
}
if (transposed) { // almost never happens
fmat[2]*=-1;
fmat[4]*=-1;
}
// yv+=fmat.xvR
cReflMatrVec(fmat,xvR,yvR);
cvAdd(yvR,yv,yv);
// surface part finished
for (Xcomp=0;Xcomp<3;Xcomp++) {
barrier(CLK_GLOBAL_MEM_FENCE);
slices_tr[i+x*gridY*gridZ+Xcomp*gridY*gridZ*local_gridX]=yv[Xcomp];
}
}
//======================================================================================================================
// Arith4 kernel
__kernel void arith4(__global double2 *Xmatrix,__global const double2 *slices,const in_sizet gridZ,
const in_sizet smallY,const in_sizet gridX,const in_sizet gridYZ,const in_sizet local_Nsmall)
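// inverse of arith2: copies the slices buffer back into Xmatrix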
{
const size_t xa =get_global_id(0);
const size_t x =get_global_id(0)-get_global_offset(0);
const size_t z =get_global_id(1);
const size_t y =get_global_id(2);
const size_t local_gridX=get_global_size(0);
size_t i;
size_t j;
int xcomp;
double2 tmp;
i = y*gridZ+z+x*gridYZ;
j = (z*smallY+y)*gridX+xa;
for (xcomp=0;xcomp<3;xcomp++) {
tmp=slices[i+xcomp*gridYZ*local_gridX];
barrier(CLK_GLOBAL_MEM_FENCE);
Xmatrix[j+xcomp*local_Nsmall]=tmp;
}
}
//======================================================================================================================
// Arith5 kernel
__kernel void arith5(__global const uchar *material,__global const ushort *position,__constant double2 *cc_sqrt,
__global const double2 *argvec,__global const double2 *Xmatrix,const in_sizet local_Nsmall,const in_sizet smallY,
const in_sizet gridX,__global double2 *resultvec)
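// last step of the matvec: gathers grid values back to dipole order; resultvec=argvec+cc_sqrt*Xmatrix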
{
const size_t id = get_global_id(0);
const size_t j=3*id;
const uchar mat = material[id];
size_t index;
double2 temp;
int xcomp;
index = ((position[j+2]*smallY+position[j+1])*gridX+position[j]);
for (xcomp=0;xcomp<3;xcomp++) {
cMult2(&cc_sqrt[mat*3+xcomp],&Xmatrix[index+xcomp*local_Nsmall],&temp);
resultvec[j+xcomp]=argvec[j+xcomp]+temp;
}
}
//======================================================================================================================
__kernel void inpr(__global double *inprod, __global const double2 *resultvec)
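// computes the squared norm of each complex vector[3] of resultvec (per-id contribution to the inner product)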
{
const size_t id = get_global_id(0);
inprod[id]=cvNorm2(resultvec+(id*3));
}
//======================================================================================================================
// Optimized transpose kernel
// This corresponds to the value of tblock in TransposeYZ() in fft.c
#define BLOCK_DIM 16
__kernel void transposeo(__global const double2 *idata,__global double2 *odata,const in_sizet width,
const in_sizet height,__local double2 *block)
// optimized transpose kernel with local-memory caching and bank conflicts removed; adapted from the Nvidia SDK samples
{
// read tiles into local memory
size_t xIndex = get_global_id(0);
size_t yIndex = get_global_id(1);
const size_t zIndex = get_global_id(2);
const size_t htw = height*width;
if ((xIndex < width) && (yIndex < height)) {
size_t index_in = yIndex * width + xIndex + htw * zIndex;
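// the tile row stride of BLOCK_DIM+1 avoids local-memory bank conflicts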
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
}
barrier(CLK_LOCAL_MEM_FENCE);
// write transposed tile back to global memory
xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
if ((xIndex < height) && (yIndex < width)) {
size_t index_out = yIndex * height + xIndex + htw * zIndex;
odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
}
}