#include // // example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) ); // #define CUDA_CHECK(code) \ { cuda_check((code), __FILE__, __LINE__); } inline void cuda_check(cudaError_t code, const char* file, int line) { if (code != cudaSuccess) { std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl; std::abort(); } } #define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")" // // step 04 // return a pointer to the value at row i and column j from base_address // with pitch in bytes // __device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) { auto offset = i * pitch + (j * sizeof(int)); auto ptr = (char*)base_address; return (int*)(ptr + offset); } // // step 05 // CUDA kernel add __global__ void add_(int* a, int* b, size_t pitch, size_t width, size_t height) { auto x = (blockIdx.x * blockDim.x) + threadIdx.x; auto y = (blockIdx.y * blockDim.y) + threadIdx.y; if (x >= width) return; if (y >= height) return; auto ptr_a = get_ptr(a, y, x, pitch); auto ptr_b = get_ptr(b, y, x, pitch); auto res = *ptr_a + *ptr_b; *ptr_b = res; } int main() { constexpr int rows = 200; constexpr int cols = 80; int* x = (int*)malloc(rows * cols * sizeof(int)); int* y = (int*)malloc(rows * cols * sizeof(int)); for (int i = 0; i < rows * cols; ++i) { x[i] = i; y[i] = std::pow(-1, i) * i; } // // step 06 // int* dx; int* dy; size_t pitch; // 1. allocate on device CUDA_CHECK(cudaMallocPitch(&dx, &pitch, cols * sizeof(int), rows)); CUDA_CHECK(cudaMallocPitch(&dy, &pitch, cols * sizeof(int), rows)); // 2. copy from host to device auto arr_width = cols * sizeof(int); CUDA_CHECK(cudaMemcpy2D(dx, pitch, // x, arr_width, // cols * sizeof(int), rows, // cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy2D(dy, pitch, // y, arr_width, // cols * sizeof(int), rows, // cudaMemcpyHostToDevice)); // 3. launch CUDA kernel const auto threads_per_bloc = dim3(32, 32, 1); const auto blocks = dim3(cols / 32 + 1, rows / 32 + 1, 1); add_<<>>(dx, dy, pitch, cols, rows); // 4. copy result from device to host CUDA_CHECK(cudaMemcpy2D(y, arr_width, // dy, pitch, // cols * sizeof(int), rows, // cudaMemcpyDeviceToHost)); // 5. free device memory cudaFree(dx); cudaFree(dy); // checking results bool ok = true; for (int i = 0; i < rows * cols; ++i) { const int expected_result = std::pow(-1, i) * i + i; if (y[i] != expected_result) { std::cout << "Failure" << std::endl; std::cout << "Result at index i=" << i << ": expected " << std::pow(-1, i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl; ok = false; break; } } if (ok) std::cout << "Success" << std::endl; free(x); free(y); return 0; }