gpu
This commit is contained in:
parent
d976cfaf74
commit
8e4fd3db42
2 changed files with 65 additions and 13 deletions
|
@ -163,24 +163,56 @@ namespace kernel {
|
|||
// step 05
// Map 2D indices (y, x) inside the TxT sub-tile (by, bx) to the 1D
// row-major index of the full (height, width) matrix, delegating the
// final flattening (and any bounds handling) to index1.
__device__ int index2(int y, int x, int by, int bx, int height, int width) {
    auto tile_size = T;
    auto global_y = by * tile_size + y;
    auto global_x = bx * tile_size + x;
    return index1(global_y, global_x, height, width);
}
|
||||
|
||||
//
// step 06
// Tiled matrix multiply: C (N x P) = A (N x M) * B (M x P), int elements.
// Expects a 2D launch with blockDim == (T, T); each block computes one TxT
// tile of C, staging matching TxT tiles of A and B through shared memory.
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
    auto A_height = N;
    auto A_width = M;

    auto B_height = A_width;
    auto B_width = P;

    auto result_height = A_height;
    auto result_width = B_width;

    auto thread_x = threadIdx.x;
    auto thread_y = threadIdx.y;
    // Global coordinates of the C element this thread accumulates.
    auto row = blockIdx.y * T + thread_y;
    auto col = blockIdx.x * T + thread_x;

    auto result = 0;
    // Ceil-div: covers a partial last tile, and — unlike (A_width / T) + 1 —
    // does not run an extra, entirely out-of-bounds step when A_width is an
    // exact multiple of T (that extra step made every thread bail out before
    // ever writing C).
    auto step_count = (A_width + T - 1) / T;
    for (RANGE(step_index, 0, step_count)) {
        __shared__ int local_A[T][T];
        __shared__ int local_B[T][T];

        // Zero-fill out-of-range tile entries instead of returning early:
        // an early return here is a divergent exit before __syncthreads(),
        // which is undefined behavior (some threads wait at a barrier the
        // others never reach), and it also left shared slots uninitialized.
        // A zero contributes nothing to the dot product, so it is safe.
        auto a_col = step_index * T + thread_x;
        auto b_row = step_index * T + thread_y;
        local_A[thread_y][thread_x] = (row < A_height && a_col < A_width)
            ? A[index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width)]
            : 0;
        local_B[thread_y][thread_x] = (b_row < B_height && col < B_width)
            ? B[index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width)]
            : 0;
        __syncthreads();

        // Partial dot product over this tile's slice of the inner dimension.
        for (RANGE(i, 0, T)) {
            result += local_A[thread_y][i] * local_B[i][thread_x];
        }
        // Barrier before the next iteration overwrites the shared tiles.
        __syncthreads();
    }

    // Out-of-range threads still participated in the tile loads and barriers
    // above; they just must not store a result.
    if (row < result_height && col < result_width) {
        C[index1(row, col, result_height, result_width)] = result;
    }
}
|
||||
|
||||
} // namespace kernel
|
||||
|
@ -190,9 +222,28 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
|
|||
//
// step 07
// Host wrapper for kernel::matmul3: returns C (N x P) = A (N x M) * B (M x P).
// Copies both inputs to the device, launches one TxT-threaded block per
// output tile, and copies the result back.
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    auto A_height = N;
    auto A_width = M;
    auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);

    auto B_height = A_width;
    auto B_width = P;
    auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);

    auto result_height = A_height;
    auto result_width = B_width;
    auto result_dev = cuda_malloc<int>(result_height * result_width);

    // Ceil-div: enough blocks to cover the result without launching a
    // spurious extra block row/column when the size divides the tile evenly
    // (the previous `dim / threads_per_bloc + 1` did exactly that).
    auto grid_dim = dim3((result_width + threads_per_bloc - 1) / threads_per_bloc,
                         (result_height + threads_per_bloc - 1) / threads_per_bloc, 1);
    auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
    kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
    // Kernel launches do not return an error code; surface bad launch
    // configurations explicitly.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaFree(A_dev));
    CUDA_CHECK(cudaFree(B_dev));
    auto result = cuda_into_host(result_dev, result_width * result_height);
    // Was previously unchecked, inconsistent with the other frees above.
    CUDA_CHECK(cudaFree(result_dev));

    return result;
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue