gpu
This commit is contained in:
parent
8e4fd3db42
commit
656356fb3b
2 changed files with 8 additions and 8 deletions
|
@ -13,7 +13,7 @@ fi
|
||||||
|
|
||||||
mkdir -p bin
|
mkdir -p bin
|
||||||
|
|
||||||
cc=hipcc
|
cc=nvcc
|
||||||
ccargs="-O2"
|
ccargs="-O2"
|
||||||
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
|
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
|
||||||
|
|
||||||
|
|
|
@ -22,10 +22,10 @@ inline void cuda_check(cudaError_t code, const char* file, int line) {
|
||||||
// step 01
|
// step 01
|
||||||
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
|
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
|
||||||
//
|
//
|
||||||
__host__ __device__ int index1(int i, int j, int rows, int cols) {
|
__host__ __device__ int index1(int y, int x, int height, int width) {
|
||||||
if (i < 0 || i >= rows) return -1;
|
if (y < 0 || y >= height) return -1;
|
||||||
if (j < 0 || j >= cols) return -1;
|
if (x < 0 || x >= width) return -1;
|
||||||
return (i * cols) + j;
|
return (y * width) + x;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -189,6 +189,8 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
|
||||||
|
|
||||||
auto result_height = A_height;
|
auto result_height = A_height;
|
||||||
auto result_width = B_width;
|
auto result_width = B_width;
|
||||||
|
auto index = index1(THREAD_GID(y), THREAD_GID(x), result_height, result_width);
|
||||||
|
if (index == -1) return;
|
||||||
|
|
||||||
auto result = 0;
|
auto result = 0;
|
||||||
auto thread_x = threadIdx.x;
|
auto thread_x = threadIdx.x;
|
||||||
|
@ -199,8 +201,7 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
|
||||||
__shared__ int local_B[T][T];
|
__shared__ int local_B[T][T];
|
||||||
auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width);
|
auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width);
|
||||||
auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width);
|
auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width);
|
||||||
if (index_A == -1) return;
|
if (index_A == -1 || index_B == -1) continue;
|
||||||
if (index_B == -1) return;
|
|
||||||
local_A[thread_x][thread_y] = A[index_A];
|
local_A[thread_x][thread_y] = A[index_A];
|
||||||
local_B[thread_x][thread_y] = B[index_B];
|
local_B[thread_x][thread_y] = B[index_B];
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -211,7 +212,6 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
auto index = index1(THREAD_GID(y), THREAD_GID(x), result_height, result_width);
|
|
||||||
C[index] = result;
|
C[index] = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue