notes-ing2/gpu/tp2/c/src/ex2.cu
2024-03-28 17:58:33 +01:00

112 lines
2.9 KiB
Text

#include <cmath>    // std::pow
#include <cstdlib>  // std::abort, malloc, free
#include <iostream>
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
// Wraps a CUDA runtime call and aborts with file/line context on failure.
// do { } while (0) makes the macro a single statement, so it stays safe
// inside unbraced if/else bodies (a bare { } block would not).
#define CUDA_CHECK(code) \
    do { cuda_check((code), __FILE__, __LINE__); } while (0)
// Prints "<file>:<line>: [CUDA ERROR] <message>" and aborts when `code`
// is not cudaSuccess; does nothing otherwise.
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] "
                  << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
#define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")"
//
// step 04
// Returns a pointer to the element at row i, column j of a pitched 2D
// int array. `pitch` is the row stride in BYTES, as produced by
// cudaMallocPitch, so row offsets are computed in char* space.
//
__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
    char* row_start = (char*)base_address + i * pitch;
    return (int*)row_start + j;
}
//
// step 05
// CUDA kernel add: element-wise sum over a pitched 2D int layout,
// accumulated in place into `b` (b[row][col] += a[row][col]).
// Expects a 2D launch with one thread per element; out-of-range
// threads (grid overhang) are filtered by the bounds guard.
__global__ void add_(int* a, int* b, size_t pitch, size_t width,
                     size_t height) {
    const auto col = (blockIdx.x * blockDim.x) + threadIdx.x;
    const auto row = (blockIdx.y * blockDim.y) + threadIdx.y;
    if (col < width && row < height) {
        int* elem_a = get_ptr(a, row, col, pitch);
        int* elem_b = get_ptr(b, row, col, pitch);
        *elem_b = *elem_a + *elem_b;
    }
}
int main() {
    constexpr int rows = 200;
    constexpr int cols = 80;
    // Host buffers: x[i] = i and y[i] = (-1)^i * i (alternating sign).
    int* x = (int*)malloc(rows * cols * sizeof(int));
    int* y = (int*)malloc(rows * cols * sizeof(int));
    if (x == nullptr || y == nullptr) {
        std::cout << "host allocation failed" << std::endl;
        return 1;
    }
    for (int i = 0; i < rows * cols; ++i) {
        x[i] = i;
        // Exact integer alternating sign; std::pow(-1, i) goes through
        // double math for what is a parity test.
        y[i] = (i % 2 == 0) ? i : -i;
    }
    //
    // step 06
    //
    int* dx;
    int* dy;
    size_t pitch;
    // 1. allocate on device; cudaMallocPitch aligns each row and reports
    // the row stride in bytes. Both allocations have identical row widths
    // and element type, so the second call returns the same pitch the
    // kernel receives as its single pitch parameter.
    CUDA_CHECK(cudaMallocPitch(&dx, &pitch, cols * sizeof(int), rows));
    CUDA_CHECK(cudaMallocPitch(&dy, &pitch, cols * sizeof(int), rows));
    // 2. copy host -> device; host rows are tightly packed (pitch == width)
    const size_t row_bytes = cols * sizeof(int);
    CUDA_CHECK(cudaMemcpy2D(dx, pitch,     //
                            x, row_bytes,  //
                            row_bytes, rows,
                            cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy2D(dy, pitch,     //
                            y, row_bytes,  //
                            row_bytes, rows,
                            cudaMemcpyHostToDevice));
    // 3. launch CUDA kernel: 32x32 threads per block, ceil-div grid so the
    // whole matrix is covered without a spare block when sizes divide evenly
    const auto threads_per_bloc = dim3(32, 32, 1);
    const auto blocks = dim3((cols + 31) / 32, (rows + 31) / 32, 1);
    add_<<<blocks, threads_per_bloc>>>(dx, dy, pitch, cols, rows);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors
    // 4. copy result device -> host; this blocking cudaMemcpy2D on the
    // default stream also synchronizes with the kernel above
    CUDA_CHECK(cudaMemcpy2D(y, row_bytes,  //
                            dy, pitch,     //
                            row_bytes, rows,
                            cudaMemcpyDeviceToHost));
    // 5. free device memory (checked: a sticky earlier error surfaces here)
    CUDA_CHECK(cudaFree(dx));
    CUDA_CHECK(cudaFree(dy));
    // checking results: expected (-1)^i * i + i
    bool ok = true;
    for (int i = 0; i < rows * cols; ++i) {
        const int signed_i = (i % 2 == 0) ? i : -i;
        const int expected_result = signed_i + i;
        if (y[i] != expected_result) {
            std::cout << "Failure" << std::endl;
            std::cout << "Result at index i=" << i << ": expected "
                      << signed_i << '+' << i << '='
                      << expected_result << ", got " << y[i] << std::endl;
            ok = false;
            break;
        }
    }
    if (ok)
        std::cout << "Success" << std::endl;
    free(x);
    free(y);
    return 0;
}