gpu
This commit is contained in:
parent
8b3bb9c382
commit
d976cfaf74
37 changed files with 2669 additions and 371 deletions
|
@ -3,76 +3,97 @@
|
|||
//
// Abort-on-error checking for CUDA runtime API calls.
//
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
// Wrapped in do { ... } while (0) so the macro expands to exactly one
// statement and stays safe inside unbraced if/else branches — a bare
// { ... } block there would swallow the following `else`.
#define CUDA_CHECK(code) \
  do { cuda_check((code), __FILE__, __LINE__); } while (0)

// Report a failed CUDA call — file:line plus the runtime's description of
// `code` — on stderr, then abort. Does nothing when code == cudaSuccess.
inline void cuda_check(cudaError_t code, const char* file, int line) {
  if (code != cudaSuccess) {
    std::cerr << file << ':' << line << ": [CUDA ERROR] "
              << cudaGetErrorString(code) << std::endl;
    std::abort();
  }
}
|
||||
|
||||
|
||||
//
// step 01
// Return the linear index of the element at row i, column j of a
// rows x cols matrix stored in row-major order.
// Returns -1 when (i, j) falls outside the matrix, so callers can use a
// single sentinel test as their bounds check.
//
__device__ int linear_index(int i, int j, int rows, int cols) {
  // Reject negative coordinates as well: the original guarded only the
  // upper bounds and would hand back a bogus index for i < 0 or j < 0.
  if (i < 0 || i >= rows)
    return -1;
  if (j < 0 || j >= cols)
    return -1;
  return i * cols + j;
}
|
||||
|
||||
//
|
||||
// step 02
|
||||
// CUDA kernel add
|
||||
//
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
constexpr int rows = 200;
|
||||
constexpr int cols = 80;
|
||||
int* x = (int*)malloc(rows*cols*sizeof(int));
|
||||
int* y = (int*)malloc(rows*cols*sizeof(int));
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
x[i] = i;
|
||||
y[i] = std::pow(-1,i) * i;
|
||||
}
|
||||
|
||||
//
|
||||
// step 03
|
||||
//
|
||||
int* dx;
|
||||
int* dy;
|
||||
// 1. allocate on device
|
||||
|
||||
// 2. copy from host to device
|
||||
|
||||
// 3. launch CUDA kernel
|
||||
// const dim3 threads_per_bloc{32,32,1};
|
||||
|
||||
// 4. copy result from device to host
|
||||
|
||||
// 5. free device memory
|
||||
|
||||
|
||||
|
||||
// checking results
|
||||
bool ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const int expected_result = std::pow(-1,i) * i + i;
|
||||
if(y[i] != expected_result) {
|
||||
std::cout << "Failure" << std::endl;
|
||||
std::cout << "Result at index i="
|
||||
<< i << ": expected "
|
||||
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
return 0;
|
||||
// CUDA kernel add
//
// Element-wise matrix addition: dy[k] <- dx[k] + dy[k] for every valid
// cell of a rows x cols row-major matrix. Expects a 2D launch in which
// the x dimension walks columns and the y dimension walks rows; threads
// landing outside the matrix are filtered by linear_index's -1 sentinel.
__global__ void add(const int* dx, int* dy, int rows, int cols) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int k = linear_index(row, col, rows, cols);
  if (k != -1)
    dy[k] = dx[k] + dy[k];
}
|
||||
|
||||
//
// Host driver: fill two rows x cols int matrices, add them on the GPU
// (y += x), and verify the result against the CPU expectation.
// Prints "Success" when every element matches, "Failure" plus the first
// mismatch otherwise. Returns 0 in both cases, 1 on host-allocation failure.
//
int main() {
  constexpr int rows = 200;
  constexpr int cols = 80;
  int* x = (int*)malloc(rows * cols * sizeof(int));
  int* y = (int*)malloc(rows * cols * sizeof(int));
  if (x == nullptr || y == nullptr) {
    std::cerr << "host allocation failed" << std::endl;
    return 1;
  }
  for (int i = 0; i < rows * cols; ++i) {
    x[i] = i;
    // (-1)^i * i, computed in integer arithmetic instead of routing a
    // parity test through std::pow's double round-trip.
    y[i] = (i % 2 == 0 ? i : -i);
  }

  //
  // step 03
  //
  int* dx;
  int* dy;
  const size_t size = rows * cols * sizeof(int);

  // 1. allocate on device
  CUDA_CHECK(cudaMalloc(&dx, size));
  CUDA_CHECK(cudaMalloc(&dy, size));

  // 2. copy from host to device
  CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));

  // 3. launch CUDA kernel: grid x covers columns, grid y covers rows.
  const dim3 threads_per_bloc{32, 32, 1};
  // Ceil-division; dim/32 + 1 would launch a whole extra (idle) block
  // whenever the dimension is already a multiple of 32.
  const dim3 blocks{(cols + 31) / 32, (rows + 31) / 32, 1};
  add<<<blocks, threads_per_bloc>>>(dx, dy, rows, cols);
  // A kernel launch reports nothing by itself: surface bad-configuration
  // errors now, and asynchronous execution errors at the sync.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // 4. copy result from device to host
  CUDA_CHECK(cudaMemcpy(x, dx, size, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));

  // 5. free device memory
  CUDA_CHECK(cudaFree(dx));
  CUDA_CHECK(cudaFree(dy));

  // checking results: the kernel computed y[i] = x[i] + y_old[i],
  // i.e. i + (-1)^i * i.
  bool ok = true;
  for (int i = 0; i < rows * cols; ++i) {
    const int signed_i = (i % 2 == 0 ? i : -i);
    const int expected_result = signed_i + i;
    if (y[i] != expected_result) {
      std::cout << "Failure" << std::endl;
      std::cout << "Result at index i=" << i << ": expected "
                << signed_i << '+' << i << '='
                << expected_result << ", got " << y[i] << std::endl;
      ok = false;
      break;
    }
  }
  if (ok)
    std::cout << "Success" << std::endl;

  free(x);
  free(y);

  return 0;
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue