diff --git a/gpu/tp5/c/build.sh b/gpu/tp5/c/build.sh
index 82404bf..22ebca9 100755
--- a/gpu/tp5/c/build.sh
+++ b/gpu/tp5/c/build.sh
@@ -13,6 +13,7 @@ fi
 
 mkdir -p bin
 
+cc=hipcc
 ccargs="-O2"
 #ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
 
@@ -21,5 +22,5 @@ do
     sources="$MODULES $target"
     inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
     rm -f bin/${target}.out
-    nvcc $ccargs -o bin/${target}.out $modules $inputs
+    $cc $ccargs -o bin/${target}.out $modules $inputs
 done
diff --git a/gpu/tp5/c/src/matrix.cu b/gpu/tp5/c/src/matrix.cu
index 24f5d9c..5e5a350 100644
--- a/gpu/tp5/c/src/matrix.cu
+++ b/gpu/tp5/c/src/matrix.cu
@@ -163,24 +163,60 @@ namespace kernel {
 
 // step 05
 // return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
 //
-__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
-    auto local_x = j;
-    auto local_y = i;
+__device__ int index2(int y, int x, int by, int bx, int height, int width) {
+    auto local_x = x;
+    auto local_y = y;
     auto local_matrix_width = T;
-    auto base_x = bj * local_matrix_width;
-    auto base_y = bi * local_matrix_width;
+    auto base_x = bx * local_matrix_width;
+    auto base_y = by * local_matrix_width;
 
-    auto x = base_x + local_x;
-    auto y = base_y + local_y;
-    return index1(y, x, rows, cols);
+    auto global_x = base_x + local_x;
+    auto global_y = base_y + local_y;
+    return index1(global_y, global_x, height, width);
 }
 
 //
 // step 06
 //
 __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
-    auto step_count = (); //
+    auto A_height = N;
+    auto A_width = M;
+
+    auto B_height = A_width;
+    auto B_width = P;
+
+    auto result_height = A_height;
+    auto result_width = B_width;
+
+    auto result = 0;
+    auto thread_x = threadIdx.x;
+    auto thread_y = threadIdx.y;
+    // ceil-div: no extra all-out-of-range step when A_width is a multiple of T
+    auto step_count = (A_width + T - 1) / T;
+    for (RANGE(step_index, 0, step_count)) {
+        __shared__ int local_A[T][T];
+        __shared__ int local_B[T][T];
+        auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width);
+        auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width);
+        // load 0 for out-of-range elements (index2/index1 yield -1 there); an
+        // early return instead would skip __syncthreads() in divergent flow (UB)
+        local_A[thread_x][thread_y] = (index_A == -1) ? 0 : A[index_A];
+        local_B[thread_x][thread_y] = (index_B == -1) ? 0 : B[index_B];
+        __syncthreads();
+        for (RANGE(i, 0, T)) {
+            auto a = local_A[i][thread_y];
+            auto b = local_B[thread_x][i];
+            result += a * b;
+        }
+        __syncthreads();
+    }
+    // the grid over-covers the result matrix: guard the final store
+    auto out_y = THREAD_GID(y);
+    auto out_x = THREAD_GID(x);
+    if (out_y < result_height && out_x < result_width) {
+        C[index1(out_y, out_x, result_height, result_width)] = result;
+    }
 }
 
 } // namespace kernel
@@ -190,9 +226,31 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
 //
 std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
     //
-    // step 07
+    // step 04
     //
-    std::vector<int> C(N * P);
+    auto A_height = N;
+    auto A_width = M;
+    auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
 
-    return C;
+    auto B_height = A_width;
+    auto B_width = P;
+    auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
+
+    auto result_height = A_height;
+    auto result_width = B_width;
+    auto result_dev = cuda_malloc(A_height * B_width);
+
+    // ceil-div: one extra block only when the size is not a multiple of the tile
+    auto grid_dim = dim3((result_width + threads_per_bloc - 1) / threads_per_bloc,
+                         (result_height + threads_per_bloc - 1) / threads_per_bloc, 1);
+    auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
+    kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
+    CUDA_CHECK(cudaGetLastError()); // surface launch-configuration errors
+
+    CUDA_CHECK(cudaFree(A_dev));
+    CUDA_CHECK(cudaFree(B_dev));
+    auto result = cuda_into_host(result_dev, result_width * result_height);
+    CUDA_CHECK(cudaFree(result_dev));
+
+    return result;
 }