// CUDA example: element-wise addition of two 2D int arrays using
// pitched device allocations (cudaMallocPitch / cudaMemcpy2D).
#include <cmath>    // std::pow (host-side expected-value computation)
#include <cstdlib>  // std::abort
#include <iostream>
|
|
|
|
//
// Abort-on-error wrapper for CUDA runtime calls.
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
// do { } while (0) makes the macro a single statement, so it stays safe in
// un-braced if/else branches (a bare { } block there breaks the `else`).
#define CUDA_CHECK(code)                              \
    do {                                              \
        cuda_check((code), __FILE__, __LINE__);       \
    } while (0)

// Reports `file:line: [CUDA ERROR] <message>` on stderr and aborts.
// No-op when code == cudaSuccess. Intended to be called via CUDA_CHECK
// so the failing call site's file/line are captured automatically.
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cerr << file << ':' << line << ": [CUDA ERROR] "
                  << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
|
|
|
|
// Stream-formatting helper: expands to insertions producing "(X.x,X.y,X.z)"
// for any object X with .x/.y/.z members (e.g. dim3, int3).
#define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")"
|
|
|
|
//
|
|
// step 04
|
|
// return a pointer to the value at row i and column j from base_address
|
|
// with pitch in bytes
|
|
//
|
|
// Address of element (i, j) in a pitched 2D allocation: row i begins
// i * pitch BYTES past the base, and column j is an int offset into it.
__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
    char* row_start = reinterpret_cast<char*>(base_address) + i * pitch;
    return reinterpret_cast<int*>(row_start) + j;
}
|
|
|
|
//
|
|
// step 05
|
|
// CUDA kernel add
|
|
// Element-wise addition over pitched 2D int arrays: b[y][x] += a[y][x].
// Expects a 2D launch covering width x height; both arrays must share
// the same pitch (in bytes).
__global__ void add_(int* a, int* b, size_t pitch, size_t width,
                     size_t height) {
    const auto col = (blockIdx.x * blockDim.x) + threadIdx.x;
    const auto row = (blockIdx.y * blockDim.y) + threadIdx.y;
    // The grid may overshoot the array bounds; excess threads do nothing.
    if (col >= width || row >= height)
        return;
    const int* src = get_ptr(a, row, col, pitch);
    int* dst = get_ptr(b, row, col, pitch);
    *dst = *src + *dst;
}
|
|
|
|
int main() {
    constexpr int rows = 200;
    constexpr int cols = 80;
    // Host buffers: x[i] = i, y[i] = +/- i (sign alternates with parity of i).
    int* x = (int*)malloc(rows * cols * sizeof(int));
    int* y = (int*)malloc(rows * cols * sizeof(int));
    if (x == nullptr || y == nullptr) {
        std::cout << "Host allocation failed" << std::endl;
        return 1;
    }
    for (int i = 0; i < rows * cols; ++i) {
        x[i] = i;
        y[i] = (i % 2 == 0) ? i : -i;  // == pow(-1, i) * i, without doubles
    }

    //
    // step 06
    //
    int* dx;
    int* dy;
    size_t pitch_x;
    size_t pitch_y;
    // 1. allocate on device (pitched so every row is properly aligned)
    CUDA_CHECK(cudaMallocPitch(&dx, &pitch_x, cols * sizeof(int), rows));
    CUDA_CHECK(cudaMallocPitch(&dy, &pitch_y, cols * sizeof(int), rows));
    // The kernel indexes both arrays through a single pitch value. Identical
    // allocation requests should yield identical pitches, but verify instead
    // of silently overwriting one pitch with the other.
    if (pitch_x != pitch_y) {
        std::cout << "Unexpected pitch mismatch" << std::endl;
        return 1;
    }
    const size_t pitch = pitch_x;

    // 2. copy from host to device (host rows are tightly packed: spitch ==
    //    width == cols * sizeof(int); device rows use the returned pitch)
    const size_t arr_width = cols * sizeof(int);
    CUDA_CHECK(cudaMemcpy2D(dx, pitch,      //
                            x, arr_width,   //
                            arr_width, rows,//
                            cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy2D(dy, pitch,      //
                            y, arr_width,   //
                            arr_width, rows,//
                            cudaMemcpyHostToDevice));

    // 3. launch CUDA kernel: one thread per element, 2D grid, ceil-div so a
    //    partial tile is launched only when the size is not a multiple of 32
    const auto threads_per_bloc = dim3(32, 32, 1);
    const auto blocks = dim3((cols + 31) / 32, (rows + 31) / 32, 1);
    add_<<<blocks, threads_per_bloc>>>(dx, dy, pitch, cols, rows);
    CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration

    // 4. copy result from device to host (blocking copy: also synchronizes,
    //    surfacing any asynchronous kernel execution error)
    CUDA_CHECK(cudaMemcpy2D(y, arr_width,   //
                            dy, pitch,      //
                            arr_width, rows,//
                            cudaMemcpyDeviceToHost));

    // 5. free device memory
    CUDA_CHECK(cudaFree(dx));
    CUDA_CHECK(cudaFree(dy));

    // checking results: y[i] must now hold (+/- i) + i, i.e. 2i or 0
    bool ok = true;
    for (int i = 0; i < rows * cols; ++i) {
        const int signed_i = (i % 2 == 0) ? i : -i;
        const int expected_result = signed_i + i;
        if (y[i] != expected_result) {
            std::cout << "Failure" << std::endl;
            std::cout << "Result at index i=" << i << ": expected "
                      << signed_i << '+' << i << '='
                      << expected_result << ", got " << y[i] << std::endl;
            ok = false;
            break;
        }
    }
    if (ok)
        std::cout << "Success" << std::endl;

    free(x);
    free(y);

    return 0;
}
|