diff --git a/gpu/tp5/c/build.sh b/gpu/tp5/c/build.sh index 22ebca9..1c788f8 100755 --- a/gpu/tp5/c/build.sh +++ b/gpu/tp5/c/build.sh @@ -13,7 +13,7 @@ fi mkdir -p bin -cc=hipcc +cc=nvcc ccargs="-O2" #ccargs="$ccargs -g -G -Xcompiler -fsanitize=address" diff --git a/gpu/tp5/c/src/matrix.cu b/gpu/tp5/c/src/matrix.cu index 5e5a350..9b8b3f0 100644 --- a/gpu/tp5/c/src/matrix.cu +++ b/gpu/tp5/c/src/matrix.cu @@ -22,10 +22,10 @@ inline void cuda_check(cudaError_t code, const char* file, int line) { // step 01 // return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) // -__host__ __device__ int index1(int i, int j, int rows, int cols) { - if (i < 0 || i >= rows) return -1; - if (j < 0 || j >= cols) return -1; - return (i * cols) + j; +__host__ __device__ int index1(int y, int x, int height, int width) { + if (y < 0 || y >= height) return -1; + if (x < 0 || x >= width) return -1; + return (y * width) + x; } template @@ -189,6 +189,8 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) auto result_height = A_height; auto result_width = B_width; + auto index = index1(THREAD_GID(y), THREAD_GID(x), result_height, result_width); + if (index == -1) return; auto result = 0; auto thread_x = threadIdx.x; @@ -199,8 +201,7 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) __shared__ int local_B[T][T]; auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width); auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width); - if (index_A == -1) return; - if (index_B == -1) return; + if (index_A == -1 || index_B == -1) continue; local_A[thread_x][thread_y] = A[index_A]; local_B[thread_x][thread_y] = B[index_B]; __syncthreads(); @@ -211,7 +212,6 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) } __syncthreads(); } - auto index = index1(THREAD_GID(y), THREAD_GID(x), result_height, result_width); C[index] = result; }