commit d976cfaf74 (parent 8b3bb9c382)
Author: JOLIMAITRE Matthieu, 2024-03-28 17:58:33 +01:00
37 changed files with 2669 additions and 371 deletions

gpu/tp3/c/src/ex1.cu (new file, 106 lines)

@@ -0,0 +1,106 @@
#include <cstddef>
#include <iostream>
// for (RANGE(i, from, to)) iterates i over the half-open range [from, to)
#define RANGE(I, FROM, TO) \
    size_t I = FROM;       \
    I < TO;                \
    I += 1
//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
constexpr int bloc_count = 128;      // number of blocks: host-side equivalent of gridDim.x in a CUDA kernel
constexpr int threads_per_bloc = 32; // threads per block: host-side equivalent of blockDim.x in a CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 01
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
typedef struct {
    size_t from;
    size_t to;
} StrideRange;
// prints a range as "[from,to]" when streamed to std::cout, for debugging
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
// Splits array_length items into contiguous per-thread chunks. The last
// chunks may overshoot the end of the array, so callers must clamp every
// index with an `i < N` check.
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                               size_t block_id, size_t thread_id) {
    auto global_threads = block_dim * grid_dim;
    auto items_per_threads = (array_length / global_threads) + 1; // rounded up so every item is covered
    auto global_thread_index = block_id * block_dim + thread_id;
    auto from = global_thread_index * items_per_threads;
    auto to = from + items_per_threads;
    return StrideRange{from, to};
}
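// Worked example (added for clarity, using the launch parameters below):
// with B = 128 blocks of T = 32 threads, global_threads = 4096; for
// N = 1'000'000, items_per_threads = 1'000'000 / 4096 + 1 = 245, so the
// first thread covers [0,245), the next [245,490), and the last thread
// gets [1003275,1003520), which lies entirely past N and is skipped by
// the kernel's `i < N` guard.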
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
    // Every thread must zero its slot and reach __syncthreads(), even when
    // its range starts past N; an early return here would leave buffer
    // entries uninitialized and make the barrier divergent (undefined behavior).
    buffer[threadIdx.x] = 0;
    for (RANGE(i, range.from, range.to))
        if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
    __syncthreads();
    // Thread 0 folds the T partial sums of this block into dz[blockIdx.x].
    if (threadIdx.x != 0) return;
    dz[blockIdx.x] = 0;
    for (RANGE(i, 0, T)) dz[blockIdx.x] += buffer[i];
}
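// Note: thread 0 folds the T partial sums serially, which is O(T) work on a
// single thread; ex2.cu below replaces this with an O(log T) tree reduction.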
int main() {
    constexpr int N = 1'000'000;
    int* x = (int*)malloc(N * sizeof(int));
    int* y = (int*)malloc(N * sizeof(int));
    int host_expected_result = 0;
    for (int i = 0; i < N; i++) {
        x[i] = i % 10;
        y[i] = i % 3 - 1;
        host_expected_result += x[i] * y[i];
    }

    // step 02
    int *dx, *dy, *dz;
    auto size = N * sizeof(int);
    auto res_size = B * sizeof(int);
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&dz, res_size));

    // step 03
    dot<<<B, T>>>(N, dx, dy, dz);
    CUDA_CHECK(cudaGetLastError());
    int result = 0;
    int* z = (int*)malloc(res_size);
    // cudaMemcpy synchronizes with the kernel launch above before copying.
    CUDA_CHECK(cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost));
    for (RANGE(i, 0, B)) result += z[i];

    // checking results
    if (host_expected_result == result) {
        std::cout << "Success" << std::endl;
    } else {
        std::cout << "Error" << std::endl;
        std::cout << " expected: " << host_expected_result << std::endl;
        std::cout << " got:      " << result << std::endl;
    }

    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(x);
    free(y);
    free(z);
    return 0;
}
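For contrast, here is a sketch of the same per-block dot product written with the canonical grid-stride loop, in which consecutive threads read consecutive elements so global-memory accesses coalesce better than with the contiguous per-thread chunks of stride_range_for. The kernel name dot_grid_stride is hypothetical and not part of this commit; it assumes the same B/T launch configuration as ex1.cu:

// Hypothetical alternative (not in this commit): grid-stride partitioning.
__global__ void dot_grid_stride(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    int sum = 0;
    // Each thread strides by the total thread count, so thread k touches
    // elements k, k + 4096, k + 8192, ... (coalesced accesses).
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        sum += dx[i] * dy[i];
    buffer[threadIdx.x] = sum;
    __syncthreads();
    if (threadIdx.x != 0) return;
    int total = 0;
    for (int i = 0; i < T; i += 1) total += buffer[i];
    dz[blockIdx.x] = total;
}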

gpu/tp3/c/src/ex2.cu (new file, 119 lines)

@@ -0,0 +1,119 @@
#include <cstddef>
#include <iostream>
//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
constexpr int bloc_count = 128;      // number of blocks: host-side equivalent of gridDim.x in a CUDA kernel
constexpr int threads_per_bloc = 32; // threads per block: host-side equivalent of blockDim.x in a CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 04
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
// for (RANGE(i, from, to)) iterates i over the half-open range [from, to)
#define RANGE(I, FROM, TO) \
    size_t I = FROM;       \
    I < TO;                \
    I += 1
// infinite loop, exited from inside with return or break
#define loop while (1)
typedef struct {
    size_t from;
    size_t to;
} StrideRange;
// prints a range as "[from,to]" when streamed to std::cout, for debugging
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
// Splits array_length items into contiguous per-thread chunks. The last
// chunks may overshoot the end of the array, so callers must clamp every
// index with an `i < N` check.
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                               size_t block_id, size_t thread_id) {
    auto global_threads = block_dim * grid_dim;
    auto items_per_threads = (array_length / global_threads) + 1; // rounded up so every item is covered
    auto global_thread_index = block_id * block_dim + thread_id;
    auto from = global_thread_index * items_per_threads;
    auto to = from + items_per_threads;
    return StrideRange{from, to};
}
// Tree reduction over `array` in shared memory: each pass folds the upper
// half into the lower half, leaving the total in array[0]. N must be a
// power of two (here T = 32). The guard is on the add rather than an early
// return so that every thread of the block reaches __syncthreads().
__device__ void reduce_rec(int N, int* array) {
    auto length = N;
    auto thread_id = threadIdx.x;
    loop {
        if (length <= 1) return;
        auto half = length / 2;
        __syncthreads();
        if (thread_id < half) array[thread_id] += array[thread_id + half];
        length = half;
    }
}
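// Worked example (added for clarity): with N = T = 32, the active length
// shrinks 32 -> 16 -> 8 -> 4 -> 2 -> 1 over five passes; after the last
// pass, array[0] holds the sum of all 32 entries.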
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
    // No early return: every thread must zero its slot and take part in the
    // barriers inside reduce_rec, even when its range starts past N.
    buffer[threadIdx.x] = 0;
    for (RANGE(i, range.from, range.to))
        if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
    reduce_rec(T, buffer);
    if (threadIdx.x != 0) return;
    dz[blockIdx.x] = buffer[0];
}
int main() {
    constexpr int N = 1'000'000;
    int* x = (int*)malloc(N * sizeof(int));
    int* y = (int*)malloc(N * sizeof(int));
    int host_expected_result = 0;
    for (int i = 0; i < N; i++) {
        x[i] = i % 10;
        y[i] = i % 3 - 1;
        host_expected_result += x[i] * y[i];
    }

    // step 05
    int result = 0;
    int *dx, *dy, *dz;
    auto size = N * sizeof(int);
    auto res_size = B * sizeof(int);
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&dz, res_size));
    dot<<<B, T>>>(N, dx, dy, dz);
    CUDA_CHECK(cudaGetLastError());
    int* z = (int*)malloc(res_size);
    // cudaMemcpy synchronizes with the kernel launch above before copying.
    CUDA_CHECK(cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost));
    for (RANGE(i, 0, B)) result += z[i];

    // checking results
    if (host_expected_result == result) {
        std::cout << "Success" << std::endl;
    } else {
        std::cout << "Error" << std::endl;
        std::cout << " expected: " << host_expected_result << std::endl;
        std::cout << " got:      " << result << std::endl;
    }

    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(x);
    free(y);
    free(z);
    return 0;
}
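Since T is 32, each block here is exactly one warp, so a further refinement (not in this commit, sketched with a hypothetical warp_reduce helper) can keep the partial sums in registers and fold them with warp shuffles instead of shared memory:

// Hypothetical refinement (not in this commit): folds the values held by
// the 32 lanes of a warp into lane 0's register using warp shuffles.
__device__ int warp_reduce(int value) {
    for (int offset = 16; offset > 0; offset /= 2)
        value += __shfl_down_sync(0xffffffff, value, offset);
    return value; // the full sum is only valid in lane 0
}

The kernel would then write dz[blockIdx.x] directly from lane 0, with no __shared__ buffer and no __syncthreads().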