Commit d976cfaf74 by JOLIMAITRE Matthieu, 2024-03-28 17:58:33 +01:00
Parent 8b3bb9c382
37 changed files with 2669 additions and 371 deletions

gpu/tp5/c/src/main.cu (new file, 1132 lines)

Diff suppressed because it is too large.

gpu/tp5/c/src/matrix.cu (new file, 198 lines)

@@ -0,0 +1,198 @@
#include "matrix.h"
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
// loop helper: `for (RANGE(i, 0, n))` expands to `for (size_t i = 0; i < n; i += 1)`
#define SEP ;
#define RANGE(I, FROM, TO) size_t I = FROM SEP I < TO SEP I += 1
//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
}
//
// step 01
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
//
__host__ __device__ int index1(int i, int j, int rows, int cols) {
if (i < 0 || i >= rows) return -1;
if (j < 0 || j >= cols) return -1;
return (i * cols) + j;
}
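// worked example: in a 3 x 4 row-major matrix, index1(1, 2, 3, 4) == 1 * 4 + 2 == 6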
template <typename T>
__host__ __device__ inline T get_2d(const T* matrix, size_t x, size_t y, size_t width, size_t height) {
auto index = index1(y, x, height, width);
return matrix[index];
}
template <typename T>
__host__ __device__ inline void set_2d(T* matrix, T item, size_t x, size_t y, size_t width, size_t height) {
auto index = index1(y, x, height, width);
matrix[index] = item;
}
//
// CPU
//
std::vector<int> matmul1(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 02
//
auto A_height = N;
auto A_width = M;
auto B_height = A_width;
auto B_width = P;
auto result = std::vector<int>(N * P);
auto result_height = A_height;
auto result_width = B_width;
for (RANGE(x, 0, result_width)) {
for (RANGE(y, 0, result_height)) {
auto sum = 0;
for (RANGE(i, 0, A_width)) {
auto item_a = get_2d(A.data(), i, y, A_width, A_height);
auto item_b = get_2d(B.data(), x, i, B_width, B_height);
sum += (item_a * item_b);
}
set_2d(result.data(), sum, x, y, result_width, result_height);
}
}
return result;
}
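// usage sketch (hypothetical values, not from this commit): A is N x M (row-major),
// B is M x P, and the result is N x P.
//
// auto A = std::vector<int>{1, 2, 3, 4, 5, 6};    // 2 x 3
// auto B = std::vector<int>{7, 8, 9, 10, 11, 12}; // 3 x 2
// auto C = matmul1(A, B, 2, 3, 2);                // 2 x 2: {58, 64, 139, 154}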
namespace kernel {
// global 1D thread index along COORD (x, y, or z) across the whole grid
#define THREAD_GID(COORD) ((blockDim.COORD * blockIdx.COORD) + threadIdx.COORD)
//
// step 03
//
__global__ void matmul2(const int* A, const int* B, int* C, int N, int M, int P) {
auto A_height = N;
auto A_width = M;
auto B_height = A_width;
auto B_width = P;
auto result = C;
auto result_height = A_height;
auto result_width = B_width;
auto x = THREAD_GID(x);
auto y = THREAD_GID(y);
if (x >= result_width) return;
if (y >= result_height) return;
auto sum = 0;
for (RANGE(i, 0, A_width)) {
auto item_a = get_2d(A, i, y, A_width, A_height);
auto item_b = get_2d(B, x, i, B_width, B_height);
sum += (item_a * item_b);
}
set_2d(result, sum, x, y, result_width, result_height);
}
} // namespace kernel
template <typename T> inline T* cuda_malloc(size_t item_count = 1) {
T* result = nullptr;
auto size = item_count * sizeof(T);
CUDA_CHECK(cudaMalloc(&result, size));
return result;
}
template <typename T> inline T* cuda_malloc_copy(const T* source, size_t item_count = 1) {
auto result = cuda_malloc<T>(item_count);
auto size = item_count * sizeof(T);
CUDA_CHECK(cudaMemcpy(result, source, size, cudaMemcpyHostToDevice));
return result;
}
template <typename T> inline std::vector<T> cuda_into_host(const T* allocation, size_t item_count = 1) {
auto size = item_count * sizeof(T);
auto result = std::vector<T>(item_count);
CUDA_CHECK(cudaMemcpy(result.data(), allocation, size, cudaMemcpyDeviceToHost));
return result;
}
//
// GPU
//
std::vector<int> matmul2(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 04
//
auto A_height = N;
auto A_width = M;
auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
auto B_height = A_width;
auto B_width = P;
auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
auto result_height = A_height;
auto result_width = B_width;
auto result_dev = cuda_malloc<int>(A_height * B_width);
auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1); // one extra block covers any remainder
auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
kernel::matmul2<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
CUDA_CHECK(cudaGetLastError()); // surface kernel launch errors
CUDA_CHECK(cudaFree(A_dev));
CUDA_CHECK(cudaFree(B_dev));
auto result = cuda_into_host(result_dev, result_width * result_height);
CUDA_CHECK(cudaFree(result_dev));
return result;
}
namespace kernel {
//
// step 05
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
//
__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
auto local_x = j;
auto local_y = i;
auto local_matrix_width = T; // T == threads_per_bloc, the tile side defined in matrix.h
auto base_x = bj * local_matrix_width;
auto base_y = bi * local_matrix_width;
auto x = base_x + local_x;
auto y = base_y + local_y;
return index1(y, x, rows, cols);
}
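// worked example (with T == 16): index2(1, 2, 0, 1, 32, 32) addresses local cell
// (i=1, j=2) of sub-matrix (bi=0, bj=1), i.e. global (y=1, x=18), so index1 yields 1 * 32 + 18 == 50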
//
// step 06
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
auto step_count = (M + T - 1) / T; // presumably the number of T-wide tiles covering the shared dimension M; the rest of step 06 is left unfinished in this commit (a possible completion is sketched below)
}
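// A possible completion of step 06 (a sketch, not the author's solution; the name
// matmul3_sketch is hypothetical): classic shared-memory tiling. Each block computes
// one T x T tile of C; at each step it stages one T x T tile of A and one of B in
// shared memory, synchronizes, and accumulates partial dot products from the tiles.
// Out-of-bounds cells are detected via index2 returning -1 and loaded as zero.
__global__ void matmul3_sketch(const int* A, const int* B, int* C, int N, int M, int P) {
__shared__ int tile_a[T][T];
__shared__ int tile_b[T][T];
auto x = THREAD_GID(x); // column in C
auto y = THREAD_GID(y); // row in C
auto step_count = (M + T - 1) / T; // T-wide tiles covering the shared dimension M
auto sum = 0;
for (RANGE(step, 0, step_count)) {
// cooperatively load one tile of A (block row, tile `step`) and one of B (tile `step`, block column)
auto a_idx = index2(threadIdx.y, threadIdx.x, blockIdx.y, step, N, M);
auto b_idx = index2(threadIdx.y, threadIdx.x, step, blockIdx.x, M, P);
tile_a[threadIdx.y][threadIdx.x] = (a_idx >= 0) ? A[a_idx] : 0;
tile_b[threadIdx.y][threadIdx.x] = (b_idx >= 0) ? B[b_idx] : 0;
__syncthreads();
for (RANGE(i, 0, T)) sum += tile_a[threadIdx.y][i] * tile_b[i][threadIdx.x];
__syncthreads();
}
if (y < N && x < P) set_2d(C, sum, x, y, P, N);
}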
} // namespace kernel
//
// GPU by block
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 07
//
// left unimplemented in this commit: returns a zero-filled N x P matrix (a possible wrapper is sketched below)
std::vector<int> C(N * P);
return C;
}
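// A possible host wrapper for step 07 (a sketch; the name matmul3_sketch is
// hypothetical and not declared in matrix.h). It mirrors matmul2 above: copy both
// inputs to the device, launch one T x T block per output tile, and copy C back.
std::vector<int> matmul3_sketch(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
auto A_dev = cuda_malloc_copy(A.data(), N * M);
auto B_dev = cuda_malloc_copy(B.data(), M * P);
auto C_dev = cuda_malloc<int>(N * P);
auto grid_dim = dim3((P + T - 1) / T, (N + T - 1) / T, 1);
auto block_dim = dim3(T, T, 1);
kernel::matmul3_sketch<<<grid_dim, block_dim>>>(A_dev, B_dev, C_dev, N, M, P);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaFree(A_dev));
CUDA_CHECK(cudaFree(B_dev));
auto C = cuda_into_host(C_dev, N * P);
CUDA_CHECK(cudaFree(C_dev));
return C;
}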

gpu/tp5/c/src/matrix.h (new file, 31 lines)

@@ -0,0 +1,31 @@
#pragma once
#include <vector>
#include <iostream>
constexpr int threads_per_bloc = 16;
constexpr int T = threads_per_bloc; // short alias used as the tile side in matrix.cu
//
// CPU
//
std::vector<int> matmul1(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);
//
// GPU
//
std::vector<int> matmul2(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);
//
// GPU by block
//
std::vector<int> matmul3(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);
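// usage sketch (hypothetical; main.cu's diff is suppressed above, so this only
// illustrates the intended API): compare CPU and GPU results for some (N, M, P).
//
// auto A = std::vector<int>(N * M, 1);
// auto B = std::vector<int>(M * P, 1);
// assert(matmul1(A, B, N, M, P) == matmul2(A, B, N, M, P));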