gpu
commit d976cfaf74 (parent 8b3bb9c382)
37 changed files with 2669 additions and 371 deletions
25 gpu/tp5/c/build.sh Executable file
@@ -0,0 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"

TARGET="main.cu"
MODULES="matrix.cu"

if [ $# -gt 0 ]
then targets="$@"
else targets="$TARGET"
fi

mkdir -p bin

ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"

for target in $targets
do
    sources="$MODULES $target"
    inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
    rm -f bin/${target}.out
    # $inputs already covers the module sources via $sources
    nvcc $ccargs -o bin/${target}.out $inputs
done
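For reference, typical invocations of the script (assuming nvcc is on PATH; paths follow the layout above):

    ./build.sh            # builds src/matrix.cu + src/main.cu into bin/main.cu.out
    ./build.sh main.cu    # same, naming the target explicitly

Each positional argument is treated as another target under src/, so several entry points can be built in one run.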
1132 gpu/tp5/c/src/main.cu Normal file
File diff suppressed because it is too large
198 gpu/tp5/c/src/matrix.cu Normal file
@@ -0,0 +1,198 @@
#include "matrix.h"
#include <cstddef>
#include <cstdlib>
#include <iostream> // std::cout in cuda_check (also reached via matrix.h)
#include <vector>

#define SEP ;
#define RANGE(I, FROM, TO) size_t I = FROM SEP I < TO SEP I += 1

//
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
//
// step 01
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
//
__host__ __device__ int index1(int i, int j, int rows, int cols) {
    if (i < 0 || i >= rows) return -1;
    if (j < 0 || j >= cols) return -1;
    return (i * cols) + j;
}
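A quick sanity check of the row-major mapping (hand-computed from the formula above):

    // 3×4 matrix: element (i=1, j=2) lives at 1*4 + 2
    // index1(1, 2, 3, 4) == 6
    // index1(3, 0, 3, 4) == -1   (row out of range)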
template <typename T>
__host__ __device__ inline T get_2d(const T* matrix, size_t x, size_t y, size_t width, size_t height) {
    // note: callers must pass in-range (x,y); index1 returns -1 otherwise
    auto index = index1(y, x, height, width);
    return matrix[index];
}

template <typename T>
__host__ __device__ inline void set_2d(T* matrix, T item, size_t x, size_t y, size_t width, size_t height) {
    auto index = index1(y, x, height, width);
    matrix[index] = item;
}
//
// CPU
//
std::vector<int> matmul1(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 02
    //

    auto A_height = N;
    auto A_width = M;
    auto B_height = A_width;
    auto B_width = P;

    auto result = std::vector<int>(N * P);
    auto result_height = A_height;
    auto result_width = B_width;

    for (RANGE(x, 0, result_width)) {
        for (RANGE(y, 0, result_height)) {
            auto sum = 0;
            for (RANGE(i, 0, A_width)) {
                auto item_a = get_2d(A.data(), i, y, A_width, A_height);
                auto item_b = get_2d(B.data(), x, i, B_width, B_height);
                sum += (item_a * item_b);
            }
            set_2d(result.data(), sum, x, y, result_width, result_height);
        }
    }

    return result;
}
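A small hand-computed check of the row-by-column definition (illustrative values only):

    // A (N=2, M=3) = | 1 2 3 |    B (M=3, P=2) = |  7  8 |
    //                | 4 5 6 |                   |  9 10 |
    //                                            | 11 12 |
    // matmul1(A, B, 2, 3, 2) == {58, 64, 139, 154}
    // e.g. result(0,0) = 1*7 + 2*9 + 3*11 = 58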
namespace kernel {

// global thread index along one coordinate: block offset plus thread offset
#define THREAD_GID(COORD) ((blockDim.COORD * blockIdx.COORD) + threadIdx.COORD)
//
// step 03
//
__global__ void matmul2(const int* A, const int* B, int* C, int N, int M, int P) {
    auto A_height = N;
    auto A_width = M;
    auto B_height = A_width;
    auto B_width = P;

    auto result = C;
    auto result_height = A_height;
    auto result_width = B_width;

    auto x = THREAD_GID(x);
    auto y = THREAD_GID(y);
    if (x >= result_width) return;
    if (y >= result_height) return;

    auto sum = 0;
    for (RANGE(i, 0, A_width)) {
        auto item_a = get_2d(A, i, y, A_width, A_height);
        auto item_b = get_2d(B, x, i, B_width, B_height);
        sum += (item_a * item_b);
    }
    set_2d(result, sum, x, y, result_width, result_height);
}

} // namespace kernel
template <typename T> inline T* cuda_malloc(size_t item_count = 1) {
    T* result = nullptr;
    auto size = item_count * sizeof(T);
    CUDA_CHECK(cudaMalloc(&result, size));
    return result;
}

template <typename T> inline T* cuda_malloc_copy(const T* source, size_t item_count = 1) {
    auto result = cuda_malloc<T>(item_count);
    auto size = item_count * sizeof(T);
    CUDA_CHECK(cudaMemcpy(result, source, size, cudaMemcpyHostToDevice));
    return result;
}

template <typename T> inline std::vector<T> cuda_into_host(const T* allocation, size_t item_count = 1) {
    auto size = item_count * sizeof(T);
    auto result = std::vector<T>(item_count);
    CUDA_CHECK(cudaMemcpy(result.data(), allocation, size, cudaMemcpyDeviceToHost));
    return result;
}
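A minimal round-trip through these helpers (a sketch; the variable names are this example's, not the file's):

    // auto host = std::vector<int>{1, 2, 3};
    // auto dev  = cuda_malloc_copy(host.data(), host.size());
    // auto back = cuda_into_host(dev, host.size());  // back == host
    // CUDA_CHECK(cudaFree(dev));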
//
// GPU
//
std::vector<int> matmul2(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 04
    //
    auto A_height = N;
    auto A_width = M;
    auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);

    auto B_height = A_width;
    auto B_width = P;
    auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);

    auto result_height = A_height;
    auto result_width = B_width;
    auto result_dev = cuda_malloc<int>(A_height * B_width);

    // one extra block per dimension covers sizes that are not multiples of
    // threads_per_bloc (the ceil-div idiom (w + T - 1) / T would avoid the
    // spare block when the size divides evenly)
    auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1);
    auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
    kernel::matmul2<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);

    CUDA_CHECK(cudaFree(A_dev));
    CUDA_CHECK(cudaFree(B_dev));
    auto result = cuda_into_host(result_dev, result_width * result_height);
    CUDA_CHECK(cudaFree(result_dev));

    return result;
}
namespace kernel {

//
// step 05
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
//
__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
    auto local_x = j;
    auto local_y = i;

    auto local_matrix_width = T; // T = threads_per_bloc, the tile width (see matrix.h)
    auto base_x = bj * local_matrix_width;
    auto base_y = bi * local_matrix_width;

    auto x = base_x + local_x;
    auto y = base_y + local_y;
    return index1(y, x, rows, cols);
}
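With T = 16 (see matrix.h), a hand-computed check of the tile addressing:

    // element (i=3, j=4) of sub-matrix (bi=1, bj=2) in a 64×64 matrix:
    // row = 1*16 + 3 = 19, col = 2*16 + 4 = 36
    // index2(3, 4, 1, 2, 64, 64) == 19*64 + 36 == 1252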
//
// step 06
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
    // presumably the number of T-wide tile steps along the shared dimension M;
    // the rest of the kernel is left unfinished in this commit (a sketch of
    // the usual tiled shape follows the namespace below)
    auto step_count = (M + T - 1) / T;
    (void)step_count;
}

} // namespace kernel
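Step 06 stops at the step count. For orientation, this is the usual shape of a shared-memory tiled kernel; a sketch under that assumption, not the author's solution (matmul3_sketch is a name invented here):

__global__ void matmul3_sketch(const int* A, const int* B, int* C, int N, int M, int P) {
    // each block computes one T×T tile of C, staging tiles of A and B
    // through shared memory and accumulating across step_count steps
    __shared__ int tile_A[T][T];
    __shared__ int tile_B[T][T];

    auto x = THREAD_GID(x); // column in C
    auto y = THREAD_GID(y); // row in C
    auto step_count = (M + T - 1) / T;

    auto sum = 0;
    for (int s = 0; s < step_count; s += 1) {
        // each thread stages one element per tile, zero-padding out-of-range reads
        int a_col = s * T + threadIdx.x;
        int b_row = s * T + threadIdx.y;
        tile_A[threadIdx.y][threadIdx.x] = (y < N && a_col < M) ? A[index1(y, a_col, N, M)] : 0;
        tile_B[threadIdx.y][threadIdx.x] = (b_row < M && x < P) ? B[index1(b_row, x, M, P)] : 0;
        __syncthreads();

        for (int i = 0; i < T; i += 1) sum += tile_A[threadIdx.y][i] * tile_B[i][threadIdx.x];
        __syncthreads();
    }
    if (y < N && x < P) C[index1(y, x, N, P)] = sum;
}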
//
// GPU by block
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
    //
    // step 07
    // (stub: returns a zero-filled matrix for now; see the sketch below)
    //
    std::vector<int> C(N * P);

    return C;
}
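Step 07 presumably mirrors matmul2's host wrapper with the new kernel; a sketch under that assumption:

    // auto A_dev = cuda_malloc_copy(A.data(), N * M);
    // auto B_dev = cuda_malloc_copy(B.data(), M * P);
    // auto C_dev = cuda_malloc<int>(N * P);
    // kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, C_dev, N, M, P);
    // auto C = cuda_into_host(C_dev, N * P);  // plus the cudaFree calls as in matmul2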
31 gpu/tp5/c/src/matrix.h Normal file
@@ -0,0 +1,31 @@
#pragma once

#include <vector>
#include <iostream>

constexpr int threads_per_bloc = 16;
constexpr int T = threads_per_bloc;

//
// CPU
//
std::vector<int> matmul1(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);

//
// GPU
//
std::vector<int> matmul2(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);

//
// GPU by block
//
std::vector<int> matmul3(
    const std::vector<int>& A,
    const std::vector<int>& B,
    int N, int M, int P);