gpu
commit d976cfaf74 (parent 8b3bb9c382)
37 changed files with 2669 additions and 371 deletions
25 gpu/tp5/c/build.sh Executable file
@@ -0,0 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"

TARGET="main.cu"
MODULES="matrix.cu"

if [ $# -gt 0 ]
then targets="$@"
else targets="$TARGET"
fi

mkdir -p bin

ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"

for target in $targets
do
    sources="$MODULES $target"
    inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
    rm -f bin/${target}.out
    # $inputs already covers the module sources via $sources
    nvcc $ccargs -o bin/${target}.out $inputs
done
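For reference, typical invocations of the script (assuming nvcc is on PATH; paths follow the layout above):

    ./build.sh            # builds src/matrix.cu + src/main.cu into bin/main.cu.out
    ./build.sh main.cu    # same, naming the target explicitly

Each positional argument is treated as another target under src/, so several entry points can be built in one run.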
1132 gpu/tp5/c/src/main.cu Normal file
File diff suppressed because it is too large
198 gpu/tp5/c/src/matrix.cu Normal file
@@ -0,0 +1,198 @@
#include "matrix.h"
#include <cstddef>
#include <cstdlib>
#include <iostream> // std::cout in cuda_check (also reached via matrix.h)
#include <vector>

#define SEP ;
#define RANGE(I, FROM, TO) size_t I = FROM SEP I < TO SEP I += 1

//
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
//
// step 01
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
//
__host__ __device__ int index1(int i, int j, int rows, int cols) {
    if (i < 0 || i >= rows) return -1;
    if (j < 0 || j >= cols) return -1;
    return (i * cols) + j;
}
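A quick sanity check of the row-major mapping (hand-computed from the formula above):

    // 3×4 matrix: element (i=1, j=2) lives at 1*4 + 2
    // index1(1, 2, 3, 4) == 6
    // index1(3, 0, 3, 4) == -1   (row out of range)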
template <typename T>
__host__ __device__ inline T get_2d(const T* matrix, size_t x, size_t y, size_t width, size_t height) {
    // note: callers must pass in-range (x,y); index1 returns -1 otherwise
    auto index = index1(y, x, height, width);
    return matrix[index];
}

template <typename T>
__host__ __device__ inline void set_2d(T* matrix, T item, size_t x, size_t y, size_t width, size_t height) {
    auto index = index1(y, x, height, width);
    matrix[index] = item;
}
//
// CPU
//
std::vector<int> matmul1(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 02
    //

    auto A_height = N;
    auto A_width = M;
    auto B_height = A_width;
    auto B_width = P;

    auto result = std::vector<int>(N * P);
    auto result_height = A_height;
    auto result_width = B_width;

    for (RANGE(x, 0, result_width)) {
        for (RANGE(y, 0, result_height)) {
            auto sum = 0;
            for (RANGE(i, 0, A_width)) {
                auto item_a = get_2d(A.data(), i, y, A_width, A_height);
                auto item_b = get_2d(B.data(), x, i, B_width, B_height);
                sum += (item_a * item_b);
            }
            set_2d(result.data(), sum, x, y, result_width, result_height);
        }
    }

    return result;
}
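A small hand-computed check of the row-by-column definition (illustrative values only):

    // A (N=2, M=3) = | 1 2 3 |    B (M=3, P=2) = |  7  8 |
    //                | 4 5 6 |                   |  9 10 |
    //                                            | 11 12 |
    // matmul1(A, B, 2, 3, 2) == {58, 64, 139, 154}
    // e.g. result(0,0) = 1*7 + 2*9 + 3*11 = 58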
namespace kernel {

// global thread index along one coordinate: block offset plus thread offset
#define THREAD_GID(COORD) ((blockDim.COORD * blockIdx.COORD) + threadIdx.COORD)
//
// step 03
//
__global__ void matmul2(const int* A, const int* B, int* C, int N, int M, int P) {
    auto A_height = N;
    auto A_width = M;
    auto B_height = A_width;
    auto B_width = P;

    auto result = C;
    auto result_height = A_height;
    auto result_width = B_width;

    auto x = THREAD_GID(x);
    auto y = THREAD_GID(y);
    if (x >= result_width) return;
    if (y >= result_height) return;

    auto sum = 0;
    for (RANGE(i, 0, A_width)) {
        auto item_a = get_2d(A, i, y, A_width, A_height);
        auto item_b = get_2d(B, x, i, B_width, B_height);
        sum += (item_a * item_b);
    }
    set_2d(result, sum, x, y, result_width, result_height);
}

} // namespace kernel
template <typename T> inline T* cuda_malloc(size_t item_count = 1) {
    T* result = nullptr;
    auto size = item_count * sizeof(T);
    CUDA_CHECK(cudaMalloc(&result, size));
    return result;
}

template <typename T> inline T* cuda_malloc_copy(const T* source, size_t item_count = 1) {
    auto result = cuda_malloc<T>(item_count);
    auto size = item_count * sizeof(T);
    CUDA_CHECK(cudaMemcpy(result, source, size, cudaMemcpyHostToDevice));
    return result;
}

template <typename T> inline std::vector<T> cuda_into_host(const T* allocation, size_t item_count = 1) {
    auto size = item_count * sizeof(T);
    auto result = std::vector<T>(item_count);
    CUDA_CHECK(cudaMemcpy(result.data(), allocation, size, cudaMemcpyDeviceToHost));
    return result;
}
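A minimal round-trip through these helpers (a sketch; the variable names are this example's, not the file's):

    // auto host = std::vector<int>{1, 2, 3};
    // auto dev  = cuda_malloc_copy(host.data(), host.size());
    // auto back = cuda_into_host(dev, host.size());  // back == host
    // CUDA_CHECK(cudaFree(dev));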
//
// GPU
//
std::vector<int> matmul2(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 04
    //
    auto A_height = N;
    auto A_width = M;
    auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);

    auto B_height = A_width;
    auto B_width = P;
    auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);

    auto result_height = A_height;
    auto result_width = B_width;
    auto result_dev = cuda_malloc<int>(A_height * B_width);

    // one extra block per dimension covers sizes that are not multiples of
    // threads_per_bloc (the ceil-div idiom (w + T - 1) / T would avoid the
    // spare block when the size divides evenly)
    auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1);
    auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
    kernel::matmul2<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);

    CUDA_CHECK(cudaFree(A_dev));
    CUDA_CHECK(cudaFree(B_dev));
    auto result = cuda_into_host(result_dev, result_width * result_height);
    CUDA_CHECK(cudaFree(result_dev));

    return result;
}
namespace kernel {

//
// step 05
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
//
__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
    auto local_x = j;
    auto local_y = i;

    auto local_matrix_width = T; // T = threads_per_bloc, the tile width (see matrix.h)
    auto base_x = bj * local_matrix_width;
    auto base_y = bi * local_matrix_width;

    auto x = base_x + local_x;
    auto y = base_y + local_y;
    return index1(y, x, rows, cols);
}
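With T = 16 (see matrix.h), a hand-computed check of the tile addressing:

    // element (i=3, j=4) of sub-matrix (bi=1, bj=2) in a 64×64 matrix:
    // row = 1*16 + 3 = 19, col = 2*16 + 4 = 36
    // index2(3, 4, 1, 2, 64, 64) == 19*64 + 36 == 1252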
//
// step 06
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
    // presumably the number of T-wide tile steps along the shared dimension M;
    // the rest of the kernel is left unfinished in this commit (a sketch of
    // the usual tiled shape follows the namespace below)
    auto step_count = (M + T - 1) / T;
    (void)step_count;
}

} // namespace kernel
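Step 06 stops at the step count. For orientation, this is the usual shape of a shared-memory tiled kernel; a sketch under that assumption, not the author's solution (matmul3_sketch is a name invented here):

__global__ void matmul3_sketch(const int* A, const int* B, int* C, int N, int M, int P) {
    // each block computes one T×T tile of C, staging tiles of A and B
    // through shared memory and accumulating across step_count steps
    __shared__ int tile_A[T][T];
    __shared__ int tile_B[T][T];

    auto x = THREAD_GID(x); // column in C
    auto y = THREAD_GID(y); // row in C
    auto step_count = (M + T - 1) / T;

    auto sum = 0;
    for (int s = 0; s < step_count; s += 1) {
        // each thread stages one element per tile, zero-padding out-of-range reads
        int a_col = s * T + threadIdx.x;
        int b_row = s * T + threadIdx.y;
        tile_A[threadIdx.y][threadIdx.x] = (y < N && a_col < M) ? A[index1(y, a_col, N, M)] : 0;
        tile_B[threadIdx.y][threadIdx.x] = (b_row < M && x < P) ? B[index1(b_row, x, M, P)] : 0;
        __syncthreads();

        for (int i = 0; i < T; i += 1) sum += tile_A[threadIdx.y][i] * tile_B[i][threadIdx.x];
        __syncthreads();
    }
    if (y < N && x < P) C[index1(y, x, N, P)] = sum;
}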
//
// GPU by block
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 07
    // (stub: returns a zero-filled matrix for now; see the sketch below)
    //
    std::vector<int> C(N * P);

    return C;
}
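Step 07 presumably mirrors matmul2's host wrapper with the new kernel; a sketch under that assumption:

    // auto A_dev = cuda_malloc_copy(A.data(), N * M);
    // auto B_dev = cuda_malloc_copy(B.data(), M * P);
    // auto C_dev = cuda_malloc<int>(N * P);
    // kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, C_dev, N, M, P);
    // auto C = cuda_into_host(C_dev, N * P);  // plus the cudaFree calls as in matmul2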
31 gpu/tp5/c/src/matrix.h Normal file
@@ -0,0 +1,31 @@
#pragma once

#include <vector>
#include <iostream>

constexpr int threads_per_bloc = 16;
constexpr int T = threads_per_bloc;

//
// CPU
//
std::vector<int> matmul1(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);

//
// GPU
//
std::vector<int> matmul2(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);

//
// GPU by block
//
std::vector<int> matmul3(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);