gpu
This commit is contained in:
parent
8b3bb9c382
commit
d976cfaf74
37 changed files with 2669 additions and 371 deletions
|
@ -3,76 +3,97 @@
|
|||
//
// Abort-on-error checking for CUDA runtime API calls.
//
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
// Wrapped in do { ... } while (0) so the macro expands to exactly one
// statement and stays safe inside unbraced if/else branches — a bare
// { ... } block there would swallow the following `else`.
#define CUDA_CHECK(code) \
  do { cuda_check((code), __FILE__, __LINE__); } while (0)

// Report a failed CUDA call — file:line plus the runtime's description of
// `code` — on stderr, then abort. Does nothing when code == cudaSuccess.
inline void cuda_check(cudaError_t code, const char* file, int line) {
  if (code != cudaSuccess) {
    std::cerr << file << ':' << line << ": [CUDA ERROR] "
              << cudaGetErrorString(code) << std::endl;
    std::abort();
  }
}
|
||||
|
||||
|
||||
//
// step 01
// Return the linear index of the element at row i, column j of a
// rows x cols matrix stored in row-major order.
// Returns -1 when (i, j) falls outside the matrix, so callers can use a
// single sentinel test as their bounds check.
//
__device__ int linear_index(int i, int j, int rows, int cols) {
  // Reject negative coordinates as well: the original guarded only the
  // upper bounds and would hand back a bogus index for i < 0 or j < 0.
  if (i < 0 || i >= rows)
    return -1;
  if (j < 0 || j >= cols)
    return -1;
  return i * cols + j;
}
|
||||
|
||||
//
|
||||
// step 02
|
||||
// CUDA kernel add
|
||||
//
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
constexpr int rows = 200;
|
||||
constexpr int cols = 80;
|
||||
int* x = (int*)malloc(rows*cols*sizeof(int));
|
||||
int* y = (int*)malloc(rows*cols*sizeof(int));
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
x[i] = i;
|
||||
y[i] = std::pow(-1,i) * i;
|
||||
}
|
||||
|
||||
//
|
||||
// step 03
|
||||
//
|
||||
int* dx;
|
||||
int* dy;
|
||||
// 1. allocate on device
|
||||
|
||||
// 2. copy from host to device
|
||||
|
||||
// 3. launch CUDA kernel
|
||||
// const dim3 threads_per_bloc{32,32,1};
|
||||
|
||||
// 4. copy result from device to host
|
||||
|
||||
// 5. free device memory
|
||||
|
||||
|
||||
|
||||
// checking results
|
||||
bool ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const int expected_result = std::pow(-1,i) * i + i;
|
||||
if(y[i] != expected_result) {
|
||||
std::cout << "Failure" << std::endl;
|
||||
std::cout << "Result at index i="
|
||||
<< i << ": expected "
|
||||
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
return 0;
|
||||
// CUDA kernel add
//
// Element-wise matrix addition: dy[k] <- dx[k] + dy[k] for every valid
// cell of a rows x cols row-major matrix. Expects a 2D launch in which
// the x dimension walks columns and the y dimension walks rows; threads
// landing outside the matrix are filtered by linear_index's -1 sentinel.
__global__ void add(const int* dx, int* dy, int rows, int cols) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int k = linear_index(row, col, rows, cols);
  if (k != -1)
    dy[k] = dx[k] + dy[k];
}
|
||||
|
||||
//
// Host driver: fill two rows x cols int matrices, add them on the GPU
// (y += x), and verify the result against the CPU expectation.
// Prints "Success" when every element matches, "Failure" plus the first
// mismatch otherwise. Returns 0 in both cases, 1 on host-allocation failure.
//
int main() {
  constexpr int rows = 200;
  constexpr int cols = 80;
  int* x = (int*)malloc(rows * cols * sizeof(int));
  int* y = (int*)malloc(rows * cols * sizeof(int));
  if (x == nullptr || y == nullptr) {
    std::cerr << "host allocation failed" << std::endl;
    return 1;
  }
  for (int i = 0; i < rows * cols; ++i) {
    x[i] = i;
    // (-1)^i * i, computed in integer arithmetic instead of routing a
    // parity test through std::pow's double round-trip.
    y[i] = (i % 2 == 0 ? i : -i);
  }

  //
  // step 03
  //
  int* dx;
  int* dy;
  const size_t size = rows * cols * sizeof(int);

  // 1. allocate on device
  CUDA_CHECK(cudaMalloc(&dx, size));
  CUDA_CHECK(cudaMalloc(&dy, size));

  // 2. copy from host to device
  CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));

  // 3. launch CUDA kernel: grid x covers columns, grid y covers rows.
  const dim3 threads_per_bloc{32, 32, 1};
  // Ceil-division; dim/32 + 1 would launch a whole extra (idle) block
  // whenever the dimension is already a multiple of 32.
  const dim3 blocks{(cols + 31) / 32, (rows + 31) / 32, 1};
  add<<<blocks, threads_per_bloc>>>(dx, dy, rows, cols);
  // A kernel launch reports nothing by itself: surface bad-configuration
  // errors now, and asynchronous execution errors at the sync.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // 4. copy result from device to host
  CUDA_CHECK(cudaMemcpy(x, dx, size, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));

  // 5. free device memory
  CUDA_CHECK(cudaFree(dx));
  CUDA_CHECK(cudaFree(dy));

  // checking results: the kernel computed y[i] = x[i] + y_old[i],
  // i.e. i + (-1)^i * i.
  bool ok = true;
  for (int i = 0; i < rows * cols; ++i) {
    const int signed_i = (i % 2 == 0 ? i : -i);
    const int expected_result = signed_i + i;
    if (y[i] != expected_result) {
      std::cout << "Failure" << std::endl;
      std::cout << "Result at index i=" << i << ": expected "
                << signed_i << '+' << i << '='
                << expected_result << ", got " << y[i] << std::endl;
      ok = false;
      break;
    }
  }
  if (ok)
    std::cout << "Success" << std::endl;

  free(x);
  free(y);

  return 0;
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue