This commit is contained in:
JOLIMAITRE Matthieu 2024-03-29 12:19:25 +01:00
parent d976cfaf74
commit 8e4fd3db42
2 changed files with 65 additions and 13 deletions

View file

@ -163,24 +163,56 @@ namespace kernel {
// step 05
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
//
__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
auto local_x = j;
auto local_y = i;
__device__ int index2(int y, int x, int by, int bx, int height, int width) {
auto local_x = x;
auto local_y = y;
auto local_matrix_width = T;
auto base_x = bj * local_matrix_width;
auto base_y = bi * local_matrix_width;
auto base_x = bx * local_matrix_width;
auto base_y = by * local_matrix_width;
auto x = base_x + local_x;
auto y = base_y + local_y;
return index1(y, x, rows, cols);
auto global_x = base_x + local_x;
auto global_y = base_y + local_y;
return index1(global_y, global_x, height, width);
}
//
// step 06
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
auto step_count = (); //
auto A_height = N;
auto A_width = M;
auto B_height = A_width;
auto B_width = P;
auto result_height = A_height;
auto result_width = B_width;
auto result = 0;
auto thread_x = threadIdx.x;
auto thread_y = threadIdx.y;
auto step_count = (A_width / T) + 1;
for (RANGE(step_index, 0, step_count)) {
__shared__ int local_A[T][T];
__shared__ int local_B[T][T];
auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width);
auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width);
if (index_A == -1) return;
if (index_B == -1) return;
local_A[thread_x][thread_y] = A[index_A];
local_B[thread_x][thread_y] = B[index_B];
__syncthreads();
for (RANGE(i, 0, T)) {
auto a = local_A[i][thread_y];
auto b = local_B[thread_x][i];
result += a * b;
}
__syncthreads();
}
auto index = index1(THREAD_GID(y), THREAD_GID(x), result_height, result_width);
C[index] = result;
}
} // namespace kernel
@ -190,9 +222,28 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 07
// step 04
//
std::vector<int> C(N * P);
auto A_height = N;
auto A_width = M;
auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
return C;
auto B_height = A_width;
auto B_width = P;
auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
auto result_height = A_height;
auto result_width = B_width;
auto result_dev = cuda_malloc<int>(A_height * B_width);
auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1);
auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
CUDA_CHECK(cudaFree(A_dev));
CUDA_CHECK(cudaFree(B_dev));
auto result = cuda_into_host(result_dev, result_width * result_height);
cudaFree(result_dev);
return result;
}