access-gpu.cu
#include <stdio.h>
#include <assert.h>
#include <sys/time.h>
#include "access.h"
// number of blocks (8 optimal)
#define GRID_DIM 8
// number of threads per block (64 optimal)
#define BLOCK_DIM 64
// total number of gpu threads
#define NTHREADS (GRID_DIM * BLOCK_DIM)
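// abort on the most recent CUDA error (note: assert() becomes a no-op if compiled with -DNDEBUG)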
#define CUDA_CHECK() assert(cudaGetLastError() == cudaSuccess)
int arrChunked[ARR_SIZE];
int arrStriped[ARR_SIZE];
/*
Unbalanced, old version of chunked:
in the worst case, the final thread ends up doing close to half of the total work.
*/
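/*
Example (hypothetical sizes, since ARR_SIZE is defined in access.h):
with ARR_SIZE = 1000 and NTHREADS = 512, ARR_SIZE / NTHREADS = 1, so threads
0..510 each write 1 element while thread 511 writes elements 511..999 (489 of them).
*/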
__global__ void chunkedLazy(int *arr_d){
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    int i = (ARR_SIZE / NTHREADS) * id;
    int iMax;
    if(id == NTHREADS - 1){
        iMax = ARR_SIZE;
    } else {
        iMax = i + (ARR_SIZE / NTHREADS);
    }
    while(i < iMax){
        arr_d[i] = id;
        i++;
    }
}
/*
Balanced version of chunked:
(ARR_SIZE % NTHREADS) threads make ((ARR_SIZE / NTHREADS) + 1) accesses,
while the rest make (ARR_SIZE / NTHREADS) accesses.
*/
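/*
Example (same hypothetical sizes): with ARR_SIZE = 1000 and NTHREADS = 512,
ARR_SIZE % NTHREADS = 488, so threads 0..487 each write 2 elements and
threads 488..511 each write 1 element: 488*2 + 24*1 = 1000.
*/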
__global__ void chunked(int *arr_d){
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    int i;    // start
    int iMax; // end (+ 1)
    if(id < (ARR_SIZE % NTHREADS)){ // do 1 extra
        i = ((ARR_SIZE / NTHREADS) + 1) * id;
        iMax = i + ((ARR_SIZE / NTHREADS) + 1);
    } else { // don't do extra
        i = ((ARR_SIZE / NTHREADS) * id) + (ARR_SIZE % NTHREADS);
        iMax = i + (ARR_SIZE / NTHREADS);
    }
    while(i < iMax){
        arr_d[i] = id;
        i++;
    }
}
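/*
Striped (coalesced) access:
on each iteration, thread id writes index id + k * NTHREADS, so consecutive threads
of a warp touch consecutive elements and the hardware can merge their stores into a
few memory transactions, unlike chunked, where threads of a warp write addresses
roughly (ARR_SIZE / NTHREADS) elements apart.
*/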
__global__ void striped(int *arr_d){
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    int i = id;
    while(i < ARR_SIZE){
        arr_d[i] = id;
        i += NTHREADS;
    }
}
int main(){
    printf("Running access-gpu...\n");
    printf("(GRID_DIM=%d) (BLOCK_DIM=%d) (ARR_SIZE=%d)\n", GRID_DIM, BLOCK_DIM, ARR_SIZE);
    printf("\n");
    struct timeval t_0, t_f;
    double tChunked, tStriped;
    int *arrChunked_d, *arrStriped_d;
    int size = ARR_SIZE * sizeof(int);
    cudaMalloc((void **) &arrChunked_d, size);
    CUDA_CHECK();
    cudaMalloc((void **) &arrStriped_d, size);
    CUDA_CHECK();
    // chunked gpu access (bad): warp threads write addresses far apart, so stores do not coalesce
    printf("Begin chunked access...\n");
    gettimeofday(&t_0, 0);
    chunked<<<GRID_DIM, BLOCK_DIM>>>(arrChunked_d);
    CUDA_CHECK();
    cudaDeviceSynchronize();
    CUDA_CHECK();
    gettimeofday(&t_f, 0);
    tChunked = TIME(t_0, t_f);
    printf("Chunked access complete in %lfs.\n", tChunked);
    printf("\n");
    // striped gpu access (good): warp threads write adjacent elements, so stores coalesce
    printf("Begin striped access...\n");
    gettimeofday(&t_0, 0);
    striped<<<GRID_DIM, BLOCK_DIM>>>(arrStriped_d);
    CUDA_CHECK();
    cudaDeviceSynchronize();
    CUDA_CHECK();
    gettimeofday(&t_f, 0);
    tStriped = TIME(t_0, t_f);
    printf("Striped access complete in %lfs.\n", tStriped);
    printf("\n");
    printf("tChunked/tStriped: %lf\n", tChunked / tStriped);
    printf("\n");
    cudaMemcpy(arrChunked, arrChunked_d, size, cudaMemcpyDeviceToHost);
    CUDA_CHECK();
    cudaMemcpy(arrStriped, arrStriped_d, size, cudaMemcpyDeviceToHost);
    CUDA_CHECK();
    cudaFree(arrChunked_d);
    CUDA_CHECK();
    cudaFree(arrStriped_d);
    CUDA_CHECK();
    // arrays are now available on host
    if(PRINTING){
        printArr(arrChunked);
        printArr(arrStriped);
    }
    return 0;
}