This commit is contained in:
JOLIMAITRE Matthieu 2024-03-28 17:58:33 +01:00
parent 8b3bb9c382
commit d976cfaf74
37 changed files with 2669 additions and 371 deletions

9
gpu/tp2/.clang-format Normal file
View file

@ -0,0 +1,9 @@
---
BasedOnStyle: LLVM
DerivePointerAlignment: 'false'
IndentWidth: '4'
PointerAlignment: Left
TabWidth: '4'
UseTab: Always
...

4
gpu/tp2/.clangd Normal file
View file

@ -0,0 +1,4 @@
CompileFlags:
Add:
- -xcuda
- --no-cuda-version-check

15
gpu/tp2/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,15 @@
{
"files.associations": {
"iostream": "cpp",
"atomic": "cpp",
"*.tcc": "cpp",
"chrono": "cpp",
"compare": "cpp",
"optional": "cpp",
"future": "cpp",
"numeric": "cpp",
"sstream": "cpp",
"cmath": "cpp",
"*.def": "cpp"
}
}

14
gpu/tp2/README.md Normal file
View file

@ -0,0 +1,14 @@
# TP 2
> `Matthieu JOLIMAITRE <matthieu.jolimaitre@epita.fr>`
## Questions
1. Pour une matrice de `W × H`, l'index 'aplatis' de l'élément de coordonnées `(x, y)` est `k := (y × W) + x`.
2. Pour une matrice de `W × H`, les coordonnées de l'élément à l'index 'aplatis' `k` sont :
- `x := k % W` (Avec `%` le reste euclidien).
- `y := k // W` (Avec `//` le quotient euclidien).
3. Pour une matrice de 10 lignes de 100 colonnes, contenant des int (4 octets) ; Stocker les lignes avec un pitch devant être multiple de 128 :
- La longueur d'une ligne sera `100 × 4 = 400` octets.
- Le pitch sera le prochain produit de la table de 128 : `(400 // 128) + 1 = 512` octets.
- Le padding sera donc le pitch moins la longueur de la donnée : `512 - 400 = 112` octets.

View file

@ -1,20 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"
TARGET="ex1.cu ex2.cu ex3.cu ex4.cu"
TARGET="ex1 ex2 ex3"
if [ $# -gt 0 ]
then TARGET=$1
then targets=$@
fi
rm -fr bin
mkdir -p bin
for target in $TARGET
do nvcc src/$target -o bin/${target%.cu}.out
done
ccargs=""
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
for target in $TARGET
do ./bin/${target%.cu}.out
for target in $targets
do
echo ""
nvcc $ccargs -o bin/${target}.out src/${target}.cu
./bin/${target}.out
done

View file

@ -1,14 +1,16 @@
#pragma once
#include <vector>
#include <iostream>
#include <vector>
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char *file, int line) {
if(code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] "
<< cudaGetErrorString(code) << std::endl;
std::abort();
}
}
namespace linalg {
@ -16,55 +18,53 @@ namespace linalg {
//
// Generic matrix of type T (int, float, double...)
//
template<typename T>
class Matrix
{
public:
// construct matrix, allocate the 2D pitched memory on the device
__host__ Matrix(int rows, int cols);
template <typename T> class Matrix {
public:
// construct matrix, allocate the 2D pitched memory on the device
__host__ Matrix(int rows, int cols);
// free allocated device memory
__host__ void free();
// free allocated device memory
__host__ void free();
public:
// copy values from host std::vector to device Matrix
// values must be a vector of size rows x cols
// allocation is already done in the constructor
__host__ void to_cuda(const std::vector<T>& values);
public:
// copy values from host std::vector to device Matrix
// values must be a vector of size rows x cols
// allocation is already done in the constructor
__host__ void to_cuda(const std::vector<T>& values);
// copy values from device Matrix to host std::vector
// values may not ne resized
__host__ void to_cpu(std::vector<T>& values) const;
// copy values from device Matrix to host std::vector
// values may not ne resized
__host__ void to_cpu(std::vector<T>& values) const;
public:
// accessor at row i and column j
__device__ const T& operator()(int i, int j) const;
__device__ T& operator()(int i, int j);
public:
// accessor at row i and column j
__device__ const T& operator()(int i, int j) const;
__device__ T& operator()(int i, int j);
public:
__host__ Matrix operator + (const Matrix<T>& other) const;
__host__ Matrix operator - (const Matrix<T>& other) const;
__host__ Matrix operator * (const Matrix<T>& other) const;
__host__ Matrix operator / (const Matrix<T>& other) const;
public:
__host__ Matrix operator+(const Matrix<T>& other) const;
__host__ Matrix operator-(const Matrix<T>& other) const;
__host__ Matrix operator*(const Matrix<T>& other) const;
__host__ Matrix operator/(const Matrix<T>& other) const;
private:
// apply binary functor f on all pairs of elements
// f must provide the following operator
//
// T operator()(T a, T b)
//
// template<typename BinaryFunctor>
// __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;
private:
// apply binary functor f on all pairs of elements
// f must provide the following operator
//
// T operator()(T a, T b)
//
// template<typename BinaryFunctor>
// __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;
public:
__host__ __device__ inline int rows() const {return m_rows;}
__host__ __device__ inline int cols() const {return m_cols;}
public:
__host__ __device__ inline int rows() const { return m_rows; }
__host__ __device__ inline int cols() const { return m_cols; }
private:
T* m_data_ptr; // device pointer
int m_rows;
int m_cols;
size_t m_pitch;
private:
T* m_data_ptr; // device pointer
int m_rows;
int m_cols;
size_t m_pitch;
};
} // namespace linalg

View file

@ -1,5 +1,12 @@
#pragma once
#include "Matrix.h"
#define RANGE(i, from, to) \
int i = from; \
i < to; \
i += 1
namespace linalg {
namespace kernel {
@ -8,92 +15,111 @@ namespace kernel {
// step 10
// CUDA kernel add
//
template <typename T>
__device__ void add(const Matrix<T>* a, const Matrix<T>* b, Matrix<T>* res) {
auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x >= res->cols())
return;
if (y >= res->rows())
return;
auto a_ref = (const Matrix<T>&)(a);
auto b_ref = (const Matrix<T>&)(b);
auto res_ref = (Matrix<T>&)(res);
auto res_ptr = &res_ref(x, y);
*res_ptr = *(&a_ref(x, y)) + *b_ref(x, y);
}
//
// step 12
// CUDA kernel apply
//
} // namespace kernel
template<typename T>
__host__ Matrix<T>::Matrix(int rows, int cols) :
m_data_ptr(nullptr),
m_rows(rows),
m_cols(cols),
m_pitch(0)
{
// step 07
template <typename T>
__host__ Matrix<T>::Matrix(int rows, int cols)
: m_data_ptr(nullptr), m_rows(rows), m_cols(cols), m_pitch(0) {
auto line_width = cols * sizeof(T);
// step 07
cudaMallocPitch(&this->m_data_ptr, &this->m_pitch, //
line_width, rows //
);
}
template<typename T>
__host__ void Matrix<T>::free()
{
// step 07
template <typename T> __host__ void Matrix<T>::free() {
// step 07
cudaFree(this->m_data_ptr);
}
template<typename T>
__host__ void Matrix<T>::to_cuda(const std::vector<T>& values)
{
// step 08
template <typename T>
__host__ void Matrix<T>::to_cuda(const std::vector<T>& values) {
// step 08
auto vec_line_width = this->m_cols * sizeof(T);
auto vec_arr_ptr = &values.front();
cudaMemcpy2D(this->m_data_ptr, this->m_pitch, vec_arr_ptr, vec_line_width,
vec_line_width, this->m_rows, cudaMemcpyHostToDevice);
}
template<typename T>
__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const
{
// step 08
template <typename T>
__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const {
// step 08
auto vec_line_width = this->m_cols * sizeof(T);
auto vec_arr_ptr = &values.front();
cudaMemcpy2D(vec_arr_ptr, vec_line_width, this->m_data_ptr, this->m_pitch,
vec_line_width, this->m_rows, cudaMemcpyDeviceToHost);
}
template<typename T>
__device__ const T& Matrix<T>::operator()(int i, int j) const
{
// step 09
template <typename T>
__device__ const T& Matrix<T>::operator()(int i, int j) const {
// step 09
if (i >= this->m_cols)
return NULL;
if (j >= this->m_rows)
return NULL;
auto offset = (j * this->m_pitch) + (i * sizeof(T));
auto base_ptr = (u_int8_t*)(this->m_data_ptr);
auto result = base_ptr + offset;
return (const T&)(result);
}
template<typename T>
__device__ T& Matrix<T>::operator()(int i, int j)
{
// step 09
template <typename T> __device__ T& Matrix<T>::operator()(int i, int j) {
// step 09
// if (i >= this->m_cols)
// return nullptr;
// if (j >= this->m_rows)
// return nullptr;
auto offset = (j * this->m_pitch) + (i * sizeof(T));
auto base_ptr = (u_int8_t*)(this->m_data_ptr);
auto result = base_ptr + offset;
return (T&)(result);
}
template<typename T>
__host__ Matrix<T> Matrix<T>::operator + (const Matrix<T>& other) const
{
// step 11
template <typename T>
__host__ Matrix<T> Matrix<T>::operator+(const Matrix<T>& other) const {
// step 11
auto width = min(this->m_cols, other.m_cols);
auto height = min(this->m_rows, other.m_rows);
auto res = Matrix<T>(width, height);
auto threads_per_block = dim3(32, 32, 1);
auto blocks = dim3(width / 32 + 1, height / 32 + 1, 1);
kernel::add<T><<<blocks, threads_per_block>>>(this, &other, &res);
return res;
}
template<typename T>
__host__ Matrix<T> Matrix<T>::operator - (const Matrix<T>& other) const
{
// step 12
template <typename T>
__host__ Matrix<T> Matrix<T>::operator-(const Matrix<T>& other) const {
// step 12
}
template<typename T>
__host__ Matrix<T> Matrix<T>::operator * (const Matrix<T>& other) const
{
// step 12
template <typename T>
__host__ Matrix<T> Matrix<T>::operator*(const Matrix<T>& other) const {
// step 12
}
template<typename T>
__host__ Matrix<T> Matrix<T>::operator / (const Matrix<T>& other) const
{
// step 12
template <typename T>
__host__ Matrix<T> Matrix<T>::operator/(const Matrix<T>& other) const {
// step 12
}
} // namespace linalg

View file

@ -3,76 +3,97 @@
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char *file, int line) {
if(code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] "
<< cudaGetErrorString(code) << std::endl;
std::abort();
}
}
//
// step 01
// return the linear index corresponding to the element at row i and column j
// in a matrix of size rows x cols, using row-major storage
//
__device__ int linear_index(int i, int j, int rows, int cols) {
if (i >= rows)
return -1;
if (j >= cols)
return -1;
return i * cols + j;
}
//
// step 02
// CUDA kernel add
//
int main()
{
constexpr int rows = 200;
constexpr int cols = 80;
int* x = (int*)malloc(rows*cols*sizeof(int));
int* y = (int*)malloc(rows*cols*sizeof(int));
for(int i = 0; i < rows*cols; ++i) {
x[i] = i;
y[i] = std::pow(-1,i) * i;
}
//
// step 03
//
int* dx;
int* dy;
// 1. allocate on device
// 2. copy from host to device
// 3. launch CUDA kernel
// const dim3 threads_per_bloc{32,32,1};
// 4. copy result from device to host
// 5. free device memory
// checking results
bool ok = true;
for(int i = 0; i < rows*cols; ++i) {
const int expected_result = std::pow(-1,i) * i + i;
if(y[i] != expected_result) {
std::cout << "Failure" << std::endl;
std::cout << "Result at index i="
<< i << ": expected "
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
free(x);
free(y);
return 0;
// CUDA kernel add
__global__ void add(const int* dx, int* dy, int rows, int cols) {
auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
auto index = linear_index(j, i, rows, cols);
if (index == -1)
return;
auto res = dx[index] + dy[index];
dy[index] = res;
}
int main() {
constexpr int rows = 200;
constexpr int cols = 80;
int* x = (int*)malloc(rows * cols * sizeof(int));
int* y = (int*)malloc(rows * cols * sizeof(int));
for (int i = 0; i < rows * cols; ++i) {
x[i] = i;
y[i] = std::pow(-1, i) * i;
}
//
// step 03
//
int* dx;
int* dy;
// 1. allocate on device
auto size = rows * cols * sizeof(int);
CUDA_CHECK(cudaMalloc(&dx, size));
CUDA_CHECK(cudaMalloc(&dy, size));
// 2. copy from host to device
CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
// 3. launch CUDA kernel
const dim3 threads_per_bloc{32, 32, 1};
auto blocks = dim3(cols / 32 + 1, rows / 32 + 1, 1);
add<<<blocks, threads_per_bloc>>>(dx, dy, rows, cols);
// 4. copy result from device to host
CUDA_CHECK(cudaMemcpy(x, dx, size, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));
// 5. free device memory
CUDA_CHECK(cudaFree(dx));
CUDA_CHECK(cudaFree(dy));
// checking results
bool ok = true;
for (int i = 0; i < rows * cols; ++i) {
const int expected_result = std::pow(-1, i) * i + i;
if (y[i] != expected_result) {
std::cout << "Failure" << std::endl;
std::cout << "Result at index i=" << i << ": expected "
<< std::pow(-1, i) * i << '+' << i << '='
<< expected_result << ", got " << y[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
free(x);
free(y);
return 0;
}

View file

@ -3,78 +3,110 @@
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char *file, int line) {
if(code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] "
<< cudaGetErrorString(code) << std::endl;
std::abort();
}
}
#define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")"
//
// step 04
// return a pointer to the value at row i and column j from base_address
// return a pointer to the value at row i and column j from base_address
// with pitch in bytes
//
__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
auto offset = i * pitch + (j * sizeof(int));
auto ptr = (char*)base_address;
return (int*)(ptr + offset);
}
//
// step 05
// CUDA kernel add
//
int main()
{
constexpr int rows = 200;
constexpr int cols = 80;
int* x = (int*)malloc(rows*cols*sizeof(int));
int* y = (int*)malloc(rows*cols*sizeof(int));
for(int i = 0; i < rows*cols; ++i) {
x[i] = i;
y[i] = std::pow(-1,i) * i;
}
//
// step 06
//
int* dx;
int* dy;
size_t pitch;
// 1. allocate on device
// 2. copy from host to device
// 3. launch CUDA kernel
// const dim3 threads_per_bloc{32,32,1};
// 4. copy result from device to host
// 5. free device memory
// checking results
bool ok = true;
for(int i = 0; i < rows*cols; ++i) {
const int expected_result = std::pow(-1,i) * i + i;
if(y[i] != expected_result) {
std::cout << "Failure" << std::endl;
std::cout << "Result at index i="
<< i << ": expected "
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
free(x);
free(y);
return 0;
// CUDA kernel add
__global__ void add_(int* a, int* b, size_t pitch, size_t width,
size_t height) {
auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x >= width)
return;
if (y >= height)
return;
auto ptr_a = get_ptr(a, y, x, pitch);
auto ptr_b = get_ptr(b, y, x, pitch);
auto res = *ptr_a + *ptr_b;
*ptr_b = res;
}
int main() {
constexpr int rows = 200;
constexpr int cols = 80;
int* x = (int*)malloc(rows * cols * sizeof(int));
int* y = (int*)malloc(rows * cols * sizeof(int));
for (int i = 0; i < rows * cols; ++i) {
x[i] = i;
y[i] = std::pow(-1, i) * i;
}
//
// step 06
//
int* dx;
int* dy;
size_t pitch;
// 1. allocate on device
CUDA_CHECK(cudaMallocPitch(&dx, &pitch, cols * sizeof(int), rows));
CUDA_CHECK(cudaMallocPitch(&dy, &pitch, cols * sizeof(int), rows));
// 2. copy from host to device
auto arr_width = cols * sizeof(int);
CUDA_CHECK(cudaMemcpy2D(dx, pitch, //
x, arr_width, //
cols * sizeof(int), rows, //
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy2D(dy, pitch, //
y, arr_width, //
cols * sizeof(int), rows, //
cudaMemcpyHostToDevice));
// 3. launch CUDA kernel
const auto threads_per_bloc = dim3(32, 32, 1);
const auto blocks = dim3(cols / 32 + 1, rows / 32 + 1, 1);
add_<<<blocks, threads_per_bloc>>>(dx, dy, pitch, cols, rows);
// 4. copy result from device to host
CUDA_CHECK(cudaMemcpy2D(y, arr_width, //
dy, pitch, //
cols * sizeof(int), rows, //
cudaMemcpyDeviceToHost));
// 5. free device memory
cudaFree(dx);
cudaFree(dy);
// checking results
bool ok = true;
for (int i = 0; i < rows * cols; ++i) {
const int expected_result = std::pow(-1, i) * i + i;
if (y[i] != expected_result) {
std::cout << "Failure" << std::endl;
std::cout << "Result at index i=" << i << ": expected "
<< std::pow(-1, i) * i << '+' << i << '='
<< expected_result << ", got " << y[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
free(x);
free(y);
return 0;
}

View file

@ -1,148 +1,171 @@
#include "Matrix.h"
int main()
{
{
const int rows = 4;
const int cols = 4;
// instantiate two matrices of integers on the device
linalg::Matrix<int> A(rows, cols);
linalg::Matrix<int> B(rows, cols);
// fill the two matrices
A.to_cuda({ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16});
B.to_cuda({16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
int main() {
{
const int rows = 4;
const int cols = 4;
// instantiate two matrices of integers on the device
linalg::Matrix<int> A(rows, cols);
linalg::Matrix<int> B(rows, cols);
// fill the two matrices
A.to_cuda({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
B.to_cuda({16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
// compute the sum
auto C = A + B;
// compute the sum
auto C = A + B;
// transfert the result on the host
std::vector<int> c_res;
C.to_cpu(c_res);
C.free();
// check results
const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
if(c_res != c_expected) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
std::cout << " expected: ";
for(int i : c_expected) std::cout << i << " ";
std::cout << std::endl;
std::cout << " got: ";
for(int i : c_res) std::cout << i << " ";
std::cout << std::endl;
} else {
std::cout << "Success" << std::endl;
}
// transfert the result on the host
std::vector<int> c_res;
C.to_cpu(c_res);
C.free();
// compute the difference
auto D = A - B;
// check results
const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17};
if (c_res != c_expected) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (+):" << std::endl;
std::cout << " expected: ";
for (int i : c_expected)
std::cout << i << " ";
std::cout << std::endl;
std::cout << " got: ";
for (int i : c_res)
std::cout << i << " ";
std::cout << std::endl;
} else {
std::cout << "Success" << std::endl;
}
// transfert the result on the host
std::vector<int> d_res;
D.to_cpu(d_res);
D.free();
// compute the difference
auto D = A - B;
// check results
const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15};
if(d_res != d_expected) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
std::cout << " expected: ";
for(int i : d_expected) std::cout << i << " ";
std::cout << std::endl;
std::cout << " got: ";
for(int i : d_res) std::cout << i << " ";
std::cout << std::endl;
} else {
std::cout << "Success" << std::endl;
}
}
// ------------------------------------------------------------------------
{
const int rows = 89;
const int cols = 128;
linalg::Matrix<float> A(rows, cols);
linalg::Matrix<float> B(rows, cols);
std::vector<float> a_values(rows*cols);
std::vector<float> b_values(rows*cols);
for(int i = 0; i < rows*cols; ++i) {
a_values[i] = 1 + float(i) / 100;
b_values[i] = std::pow(-1, i) * float(i)/(rows*cols) * 100;
}
A.to_cuda(a_values);
B.to_cuda(b_values);
// transfert the result on the host
std::vector<int> d_res;
D.to_cpu(d_res);
D.free();
auto C = A + B;
auto D = A - B;
auto E = A * B;
auto F = A / B;
// check results
const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1,
1, 3, 5, 7, 9, 11, 13, 15};
if (d_res != d_expected) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (-):" << std::endl;
std::cout << " expected: ";
for (int i : d_expected)
std::cout << i << " ";
std::cout << std::endl;
std::cout << " got: ";
for (int i : d_res)
std::cout << i << " ";
std::cout << std::endl;
} else {
std::cout << "Success" << std::endl;
}
}
// ------------------------------------------------------------------------
{
const int rows = 89;
const int cols = 128;
linalg::Matrix<float> A(rows, cols);
linalg::Matrix<float> B(rows, cols);
std::vector<float> a_values(rows * cols);
std::vector<float> b_values(rows * cols);
for (int i = 0; i < rows * cols; ++i) {
a_values[i] = 1 + float(i) / 100;
b_values[i] = std::pow(-1, i) * float(i) / (rows * cols) * 100;
}
A.to_cuda(a_values);
B.to_cuda(b_values);
std::vector<float> c_values;
C.to_cpu(c_values);
std::vector<float> d_values;
D.to_cpu(d_values);
std::vector<float> e_values;
E.to_cpu(e_values);
std::vector<float> f_values;
F.to_cpu(f_values);
auto C = A + B;
auto D = A - B;
auto E = A * B;
auto F = A / B;
C.free();
D.free();
E.free();
F.free();
std::vector<float> c_values;
C.to_cpu(c_values);
std::vector<float> d_values;
D.to_cpu(d_values);
std::vector<float> e_values;
E.to_cpu(e_values);
std::vector<float> f_values;
F.to_cpu(f_values);
const float epsilon = 0.001;
bool ok = true;
for(int i = 0; i < rows*cols; ++i) {
const float diff = std::abs( c_values[i] - (a_values[i] + b_values[i]) );
if(diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
std::cout << " expected: " << a_values[i] + b_values[i] << std::endl;
std::cout << " got: " << c_values[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
ok = true;
for(int i = 0; i < rows*cols; ++i) {
const float diff = std::abs( d_values[i] - (a_values[i] - b_values[i]) );
if(diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
std::cout << " expected: " << a_values[i] - b_values[i] << std::endl;
std::cout << " got: " << d_values[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
C.free();
D.free();
E.free();
F.free();
ok = true;
for(int i = 0; i < rows*cols; ++i) {
const float diff = std::abs( e_values[i] - (a_values[i] * b_values[i]) );
if(diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (*):" << std::endl;
std::cout << " expected: " << a_values[i] * b_values[i] << std::endl;
std::cout << " got: " << e_values[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
const float epsilon = 0.001;
bool ok = true;
for (int i = 0; i < rows * cols; ++i) {
const float diff =
std::abs(c_values[i] - (a_values[i] + b_values[i]));
if (diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (+):" << std::endl;
std::cout << " expected: " << a_values[i] + b_values[i]
<< std::endl;
std::cout << " got: " << c_values[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
ok = true;
for(int i = 0; i < rows*cols; ++i) {
const float diff = std::abs( f_values[i] - (a_values[i] / b_values[i]) );
if(diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (/):" << std::endl;
std::cout << " expected: " << a_values[i] / b_values[i] << std::endl;
std::cout << " got: " << f_values[i] << std::endl;
ok = false;
break;
}
}
if(ok) std::cout << "Success" << std::endl;
}
ok = true;
for (int i = 0; i < rows * cols; ++i) {
const float diff =
std::abs(d_values[i] - (a_values[i] - b_values[i]));
if (diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (-):" << std::endl;
std::cout << " expected: " << a_values[i] - b_values[i]
<< std::endl;
std::cout << " got: " << d_values[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
return 0;
ok = true;
for (int i = 0; i < rows * cols; ++i) {
const float diff =
std::abs(e_values[i] - (a_values[i] * b_values[i]));
if (diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (*):" << std::endl;
std::cout << " expected: " << a_values[i] * b_values[i]
<< std::endl;
std::cout << " got: " << e_values[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
ok = true;
for (int i = 0; i < rows * cols; ++i) {
const float diff =
std::abs(f_values[i] - (a_values[i] / b_values[i]));
if (diff > epsilon) {
std::cout << __FILE__ << ":" << __LINE__
<< ": Failure (/):" << std::endl;
std::cout << " expected: " << a_values[i] / b_values[i]
<< std::endl;
std::cout << " got: " << f_values[i] << std::endl;
ok = false;
break;
}
}
if (ok)
std::cout << "Success" << std::endl;
}
return 0;
}

13
gpu/tp3/.clang-format Normal file
View file

@ -0,0 +1,13 @@
# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
---
BasedOnStyle: LLVM
DerivePointerAlignment: false
IndentWidth: 4
PointerAlignment: Left
TabWidth: 4
UseTab: Always
AllowShortIfStatementsOnASingleLine: AllIfsAndElse
AllowShortLoopsOnASingleLine: true
ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: All

4
gpu/tp3/.clangd Normal file
View file

@ -0,0 +1,4 @@
CompileFlags:
Add:
- -xcuda
- --no-cuda-version-check

2
gpu/tp3/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
bin/
/*.zip

3
gpu/tp3/README.md Normal file
View file

@ -0,0 +1,3 @@
# TP 3
> `Matthieu JOLIMAITRE <matthieu.jolimaitre@epita.fr>`

25
gpu/tp3/c/build.sh Executable file
View file

@ -0,0 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"
TARGET="ex1 ex2"
if [ $# -gt 0 ]
then targets=$@
fi
rm -fr bin
mkdir -p bin
ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
for target in $targets
do
echo ""
nvcc $ccargs -o bin/${target}.out src/${target}.cu
./bin/${target}.out
done

106
gpu/tp3/c/src/ex1.cu Normal file
View file

@ -0,0 +1,106 @@
#include <cstddef>
#include <iostream>
#define RANGE(I, FROM, TO) \
size_t I = FROM; \
I < TO; \
I += 1
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
}
constexpr int bloc_count = 128; // constexpr equivalent to blockDim.x in CUDA kernel
constexpr int threads_per_bloc = 32; // constexpr equivalent to gridDim.x in CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 01
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
typedef struct {
size_t from;
size_t to;
} StrideRange;
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
size_t block_id, size_t thread_id) {
auto global_threads = block_dim * grid_dim;
auto items_per_threads = (array_length / global_threads) + 1;
auto global_thread_index = block_id * block_dim + thread_id;
auto from = global_thread_index * items_per_threads;
auto to = from + items_per_threads;
return StrideRange{from, to};
}
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
__shared__ int buffer[T];
auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
if (range.from >= N) return;
buffer[threadIdx.x] = 0;
for (RANGE(i, range.from, range.to))
if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
__syncthreads();
if (threadIdx.x != 0) return;
dz[blockIdx.x] = 0;
for (RANGE(i, 0, T)) dz[blockIdx.x] += buffer[i];
}
int main() {
constexpr int N = 1e6;
int* x = (int*)malloc(N * sizeof(int));
int* y = (int*)malloc(N * sizeof(int));
int host_expected_result = 0;
for (int i = 0; i < N; i++) {
x[i] = i % 10;
y[i] = i % 3 - 1;
host_expected_result += x[i] * y[i];
}
// step 02
int *dx, *dy, *dz;
auto size = N * sizeof(int);
auto res_size = B * sizeof(int);
cudaMalloc(&dx, size);
cudaMalloc(&dy, size);
cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
cudaMalloc(&dz, res_size);
// step 03
dot<<<B, T>>>(N, dx, dy, dz);
int result = 0;
int* z = (int*)malloc(res_size);
cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
for (RANGE(i, 0, B)) result += z[i];
// checking results
if (host_expected_result == result) {
std::cout << "Success" << std::endl;
} else {
std::cout << "Error" << std::endl;
std::cout << " expected: " << host_expected_result << std::endl;
std::cout << " got: " << result << std::endl;
}
free(x);
free(y);
return 0;
}

119
gpu/tp3/c/src/ex2.cu Normal file
View file

@ -0,0 +1,119 @@
#include <iostream>
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
}
constexpr int bloc_count = 128; // constexpr equivalent to blockDim.x in CUDA kernel
constexpr int threads_per_bloc = 32; // constexpr equivalent to gridDim.x in CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 04
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
#define RANGE(I, FROM, TO) \
size_t I = FROM; \
I < TO; \
I += 1
#define loop while (1)
typedef struct {
size_t from;
size_t to;
} StrideRange;
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
size_t block_id, size_t thread_id) {
auto global_threads = block_dim * grid_dim;
auto items_per_threads = (array_length / global_threads) + 1;
auto global_thread_index = block_id * block_dim + thread_id;
auto from = global_thread_index * items_per_threads;
auto to = from + items_per_threads;
return StrideRange{from, to};
}
__device__ void reduce_rec(int N, int* array) {
auto length = N;
auto thread_id = threadIdx.x;
loop {
if (length <= 1) return;
auto half = length / 2;
auto used_threads = half;
if (thread_id >= used_threads) return;
__syncthreads();
array[thread_id] += array[thread_id + half];
length = half;
}
}
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
__shared__ int buffer[T];
auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
if (range.from >= N) return;
buffer[threadIdx.x] = 0;
for (RANGE(i, range.from, range.to))
if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
reduce_rec(T, buffer);
if (threadIdx.x != 0) return;
dz[blockIdx.x] = buffer[0];
}
int main() {
constexpr int N = 1e6;
int* x = (int*)malloc(N * sizeof(int));
int* y = (int*)malloc(N * sizeof(int));
int host_expected_result = 0;
for (int i = 0; i < N; i++) {
x[i] = i % 10;
y[i] = i % 3 - 1;
host_expected_result += x[i] * y[i];
}
// step 05
int result = 0;
int *dx, *dy, *dz;
auto size = N * sizeof(int);
auto res_size = B * sizeof(int);
cudaMalloc(&dx, size);
cudaMalloc(&dy, size);
cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
cudaMalloc(&dz, res_size);
dot<<<B, T>>>(N, dx, dy, dz);
int* z;
z = (int*)malloc(res_size);
cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
for (RANGE(i, 0, B)) result += z[i];
// checking results
if (host_expected_result == result) {
std::cout << "Success" << std::endl;
} else {
std::cout << "Error" << std::endl;
std::cout << " expected: " << host_expected_result << std::endl;
std::cout << " got: " << result << std::endl;
}
free(x);
free(y);
return 0;
}

BIN
gpu/tp3/tp3.pdf Normal file

Binary file not shown.

13
gpu/tp4/.clang-format Normal file
View file

@ -0,0 +1,13 @@
# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
---
BasedOnStyle: LLVM
DerivePointerAlignment: false
IndentWidth: 4
PointerAlignment: Left
TabWidth: 4
UseTab: Always
AllowShortIfStatementsOnASingleLine: AllIfsAndElse
AllowShortLoopsOnASingleLine: true
ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: All

4
gpu/tp4/.clangd Normal file
View file

@ -0,0 +1,4 @@
CompileFlags:
Add:
- -xcuda
- --no-cuda-version-check

2
gpu/tp4/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
bin/
/*.zip

1
gpu/tp4/README.md Normal file
View file

@ -0,0 +1 @@
#

26
gpu/tp4/c/build.sh Executable file
View file

@ -0,0 +1,26 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"
TARGET="ex1.cu ex2.cu ex3.cu"
MODULES="conv.cu"
if [ $# -gt 0 ]
then targets="$@"
else targets="$TARGET"
fi
rm -fr bin
mkdir -p bin
ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
for target in $targets
do
sources="$MODULES $target"
inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
nvcc $ccargs -o bin/${target}.out $modules $inputs
./bin/${target}.out
done

183
gpu/tp4/c/src/conv.cu Normal file
View file

@ -0,0 +1,183 @@
#include "conv.h"
constexpr int threads_per_bloc = 16;
constexpr int T = threads_per_bloc;
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
}
#define RANGE(X, FROM, TO) \
long X = FROM; \
X < TO; \
X += 1
#define THREAD_ID_X() (blockIdx.x * blockDim.x + threadIdx.x);
#define DBG(X, FMT) printf(#X ": %" FMT "\n", X);
#define DBG_S(X) #X << ": " << X << ", "
//
// 1D convolution
// - x: input array of size N
// - y: kernel of odd size M
//
// CPU
//
std::vector<int> conv1(const std::vector<int>& x, const std::vector<int>& y) {
//
// step 01
//
const int N = x.size();
const int M = y.size();
const int P = (M - 1) / 2;
auto z = std::vector<int>(N);
for (RANGE(result_index, 0, x.size())) {
auto result = 0;
for (RANGE(y_index, 0, y.size())) {
auto x_index = result_index - P + y_index;
if (x_index < 0 || x_index >= x.size()) continue;
result += x.at(x_index) * y.at(y_index);
}
z.at(result_index) = result;
}
return z;
}
namespace kernel {
//
// step 02
//
__global__ void conv2(const int* dx, const int* dy, int x_length, int y_length, int* dz) {
auto thread_id = (long)THREAD_ID_X();
if (thread_id >= x_length) return;
auto offset = (y_length - 1) / 2;
auto result_index = thread_id;
auto result = 0;
for (RANGE(y_index, 0, y_length)) {
auto x_index = result_index - offset + y_index;
if (x_index < 0 || x_index >= x_length) continue;
result += dx[x_index] * dy[y_index];
}
dz[result_index] = result;
}
} // namespace kernel
//
// 1D convolution
// - x: input array of size N
// - y: kernel of odd size M
//
// GPU (naive)
//
std::vector<int> conv2(const std::vector<int>& x, const std::vector<int>& y) {
//
// step 03
//
auto dx = (int*)nullptr;
auto size_dx = x.size() * sizeof(int);
cudaMalloc(&dx, size_dx);
cudaMemcpy(dx, x.data(), size_dx, cudaMemcpyHostToDevice);
auto dy = (int*)nullptr;
auto size_dy = y.size() * sizeof(int);
cudaMalloc(&dy, size_dy);
cudaMemcpy(dy, y.data(), size_dy, cudaMemcpyHostToDevice);
auto dz = (int*)nullptr;
auto size_dz = x.size() * sizeof(int);
cudaMalloc(&dz, size_dz);
auto blocks = x.size() / threads_per_bloc + 1;
kernel::conv2<<<blocks, threads_per_bloc>>>(dx, dy, x.size(), y.size(), dz);
cudaFree(dx);
cudaFree(dy);
auto z = std::vector<int>(x.size());
cudaMemcpy(z.data(), dz, size_dz, cudaMemcpyDeviceToHost);
cudaFree(dz);
return z;
}
namespace kernel {
//
// step 04
//
__global__ void conv3(const int* dx, const int* dy, int x_length, int y_length, int* dz) {
__shared__ int buffer[T];
auto thread_id = (long)THREAD_ID_X();
if (thread_id >= x_length) return;
buffer[thread_id % T] = dx[thread_id];
__syncthreads();
auto buffer_lower_x_index = (thread_id / T) * T;
auto buffer_upper_x_index = buffer_lower_x_index + T;
auto offset = (y_length - 1) / 2;
auto result_index = thread_id;
auto result = 0;
for (RANGE(y_index, 0, y_length)) {
auto x_index = result_index - offset + y_index;
if (x_index < 0 || x_index >= x_length) continue;
auto in_buffer = x_index >= buffer_lower_x_index && x_index < buffer_upper_x_index;
if (in_buffer) {
auto buff_index = x_index - buffer_lower_x_index;
result += buffer[buff_index] * dy[y_index];
} else result += dx[x_index] * dy[y_index];
}
dz[result_index] = result;
}
} // namespace kernel
//
// 1D convolution
// - x: input array of size N
// - y: kernel of odd size M
//
// GPU (optimized)
//
std::vector<int> conv3(const std::vector<int>& x, const std::vector<int>& y) {
//
// step 05
//
auto dx = (int*)nullptr;
auto size_dx = x.size() * sizeof(int);
cudaMalloc(&dx, size_dx);
cudaMemcpy(dx, x.data(), size_dx, cudaMemcpyHostToDevice);
auto dy = (int*)nullptr;
auto size_dy = y.size() * sizeof(int);
cudaMalloc(&dy, size_dy);
cudaMemcpy(dy, y.data(), size_dy, cudaMemcpyHostToDevice);
auto dz = (int*)nullptr;
auto size_dz = x.size() * sizeof(int);
cudaMalloc(&dz, size_dz);
auto blocks = x.size() / threads_per_bloc + 1;
kernel::conv3<<<blocks, threads_per_bloc>>>(dx, dy, x.size(), y.size(), dz);
cudaFree(dx);
cudaFree(dy);
auto z = std::vector<int>(x.size());
cudaMemcpy(z.data(), dz, size_dz, cudaMemcpyDeviceToHost);
cudaFree(dz);
return z;
}

19
gpu/tp4/c/src/conv.h Normal file
View file

@ -0,0 +1,19 @@
#pragma once
#include <iostream>
#include <vector>
//
// 1D convolution
// - x: input array of size N
// - y: kernel of odd size M
//
// CPU
std::vector<int> conv1(const std::vector<int>& x, const std::vector<int>& y);
// GPU (naive)
std::vector<int> conv2(const std::vector<int>& x, const std::vector<int>& y);
// GPU (optimized)
std::vector<int> conv3(const std::vector<int>& x, const std::vector<int>& y);

66
gpu/tp4/c/src/ex1.cu Normal file
View file

@ -0,0 +1,66 @@
#include "conv.h"
void print(const std::vector<int>& vec) {
if (vec.empty()) {
std::cout << "[]" << std::endl;
} else {
std::cout << "[";
for (size_t i = 0; i < vec.size() - 1; ++i) std::cout << vec[i] << ", ";
std::cout << vec.back() << "]" << std::endl;
}
}
int main() {
{
std::cout << "Test 1" << std::endl;
const auto x = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const auto y = std::vector{0, 1, 0}; // M = 3
const auto z_sol = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
const auto z = conv1(x, y);
if (z != z_sol) {
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
} else {
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 2" << std::endl;
const auto x = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const auto y = std::vector{1, 2, 4, 2, 1}; // M = 5
const auto z_sol = std::vector{4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
const auto z = conv1(x, y);
if (z != z_sol) {
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
} else {
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 3" << std::endl;
const auto x = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
const auto y =
std::vector{1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
const auto z_sol =
std::vector{117, -736, -1333, -2058, -2719, -3412, -4089, -4774, -5455, -6138, -6820, -7502,
-8184, -8866, -9548, -10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686,
-16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
const auto z = conv1(x, y);
if (z != z_sol) {
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
} else {
std::cout << "Ok" << std::endl;
}
}
return 0;
}

76
gpu/tp4/c/src/ex2.cu Normal file
View file

@ -0,0 +1,76 @@
#include "conv.h"
void print(const std::vector<int>& vec)
{
if(vec.empty())
{
std::cout << "[]" << std::endl;
}
else
{
std::cout << "[";
for(size_t i = 0; i < vec.size()-1; ++i)
std::cout << vec[i] << ", ";
std::cout << vec.back() << "]" << std::endl;
}
}
int main()
{
{
std::cout << "Test 1" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const std::vector<int> y = {0, 1, 0}; // M = 3
const std::vector<int> z_sol = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
const std::vector<int> z = conv2(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 2" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const std::vector<int> y = {1, 2, 4, 2, 1}; // M = 5
const std::vector<int> z_sol = {4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
const std::vector<int> z = conv2(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 3" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
const std::vector<int> y = {1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
const std::vector<int> z_sol = {117, -736, -1333, -2058, -2719, -3412, -4089, -4774, -5455, -6138, -6820, -7502, -8184, -8866, -9548, -10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686, -16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
const std::vector<int> z = conv2(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
return 0;
}

76
gpu/tp4/c/src/ex3.cu Normal file
View file

@ -0,0 +1,76 @@
#include "conv.h"
void print(const std::vector<int>& vec)
{
if(vec.empty())
{
std::cout << "[]" << std::endl;
}
else
{
std::cout << "[";
for(size_t i = 0; i < vec.size()-1; ++i)
std::cout << vec[i] << ", ";
std::cout << vec.back() << "]" << std::endl;
}
}
int main()
{
{
std::cout << "Test 1" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const std::vector<int> y = {0, 1, 0}; // M = 3
const std::vector<int> z_sol = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
const std::vector<int> z = conv3(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 2" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
const std::vector<int> y = {1, 2, 4, 2, 1}; // M = 5
const std::vector<int> z_sol = {4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
const std::vector<int> z = conv3(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
{
std::cout << "Test 3" << std::endl;
const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
const std::vector<int> y = {1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
const std::vector<int> z_sol = {117, -736, -1333, -2058, -2719, -3412, -4089, -4774, -5455, -6138, -6820, -7502, -8184, -8866, -9548, -10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686, -16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
const std::vector<int> z = conv3(x, y);
if(z != z_sol)
{
std::cout << "Error, expected:" << std::endl;
print(z_sol);
std::cout << "got:" << std::endl;
print(z);
}
else
{
std::cout << "Ok" << std::endl;
}
}
return 0;
}

13
gpu/tp5/.clang-format Normal file
View file

@ -0,0 +1,13 @@
# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
---
BasedOnStyle: LLVM
DerivePointerAlignment: false
IndentWidth: 4
PointerAlignment: Left
TabWidth: 4
UseTab: Always
AllowShortIfStatementsOnASingleLine: AllIfsAndElse
AllowShortLoopsOnASingleLine: true
ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: All

4
gpu/tp5/.clangd Normal file
View file

@ -0,0 +1,4 @@
CompileFlags:
Add:
- -xcuda
- --no-cuda-version-check

2
gpu/tp5/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
bin/
/*.zip

6
gpu/tp5/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,6 @@
{
"editor.formatOnType": true,
"[commonlisp]": {
"editor.wordSeparators": "`|;:'\",()"
}
}

25
gpu/tp5/c/build.sh Executable file
View file

@ -0,0 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"
TARGET="main.cu"
MODULES="matrix.cu"
if [ $# -gt 0 ]
then targets="$@"
else targets="$TARGET"
fi
mkdir -p bin
ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
for target in $targets
do
sources="$MODULES $target"
inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
rm -f bin/${target}.out
nvcc $ccargs -o bin/${target}.out $modules $inputs
done

1132
gpu/tp5/c/src/main.cu Normal file

File diff suppressed because it is too large Load diff

198
gpu/tp5/c/src/matrix.cu Normal file
View file

@ -0,0 +1,198 @@
#include "matrix.h"
#include <cstddef>
#include <cstdlib>
#include <vector>
#define SEP ;
#define RANGE(I, FROM, TO) size_t I = FROM SEP I < TO SEP I += 1
//
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
//
#define CUDA_CHECK(code) \
{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
std::abort();
}
}
//
// step 01
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
//
__host__ __device__ int index1(int i, int j, int rows, int cols) {
if (i < 0 || i >= rows) return -1;
if (j < 0 || j >= cols) return -1;
return (i * cols) + j;
}
template <typename T>
__host__ __device__ inline int get_2d(const T* matrix, size_t x, size_t y, size_t width, size_t height) {
auto index = index1(y, x, height, width);
return matrix[index];
}
template <typename T>
__host__ __device__ inline void set_2d(T* matrix, int item, size_t x, size_t y, size_t width, size_t height) {
auto index = index1(y, x, height, width);
matrix[index] = item;
}
//
// CPU
//
std::vector<int> matmul1(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 02
//
auto A_height = N;
auto A_width = M;
auto B_height = A_width;
auto B_width = P;
auto result = std::vector<int>(N * P);
auto result_height = A_height;
auto result_width = B_width;
for (RANGE(x, 0, result_width)) {
for (RANGE(y, 0, result_height)) {
auto sum = 0;
for (RANGE(i, 0, A_width)) {
auto item_a = get_2d(A.data(), i, y, A_width, A_height);
auto item_b = get_2d(B.data(), x, i, B_width, B_height);
sum += (item_a * item_b);
}
set_2d(result.data(), sum, x, y, result_width, result_height);
}
}
return result;
}
namespace kernel {
#define THREAD_GID(COORD) ((blockDim.COORD * blockIdx.COORD) + threadIdx.COORD)
//
// step 03
//
__global__ void matmul2(const int* A, const int* B, int* C, int N, int M, int P) {
auto A_height = N;
auto A_width = M;
auto B_height = A_width;
auto B_width = P;
auto result = C;
auto result_height = A_height;
auto result_width = B_width;
auto x = THREAD_GID(x);
auto y = THREAD_GID(y);
if (x >= result_width) return;
if (y >= result_height) return;
auto sum = 0;
for (RANGE(i, 0, A_width)) {
auto item_a = get_2d(A, i, y, A_width, A_height);
auto item_b = get_2d(B, x, i, B_width, B_height);
sum += (item_a * item_b);
}
set_2d(result, sum, x, y, result_width, result_height);
return;
}
} // namespace kernel
template <typename T> inline T* cuda_malloc(size_t item_count = 1) {
T* result = nullptr;
auto size = item_count * sizeof(T);
CUDA_CHECK(cudaMalloc(&result, size));
return result;
}
template <typename T> inline T* cuda_malloc_copy(const T* source, size_t item_count = 1) {
auto result = cuda_malloc<T>(item_count);
auto size = item_count * sizeof(T);
CUDA_CHECK(cudaMemcpy(result, source, size, cudaMemcpyHostToDevice));
return result;
}
template <typename T> inline std::vector<T> cuda_into_host(const T* allocation, size_t item_count = 1) {
auto size = item_count * sizeof(T);
auto result = std::vector<T>(item_count);
CUDA_CHECK(cudaMemcpy(result.data(), allocation, size, cudaMemcpyDeviceToHost));
return result;
}
//
// GPU
//
std::vector<int> matmul2(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 04
//
auto A_height = N;
auto A_width = M;
auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
auto B_height = A_width;
auto B_width = P;
auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
auto result_height = A_height;
auto result_width = B_width;
auto result_dev = cuda_malloc<int>(A_height * B_width);
auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1);
auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
kernel::matmul2<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
CUDA_CHECK(cudaFree(A_dev));
CUDA_CHECK(cudaFree(B_dev));
auto result = cuda_into_host(result_dev, result_width * result_height);
cudaFree(result_dev);
return result;
}
namespace kernel {
//
// step 05
// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
//
__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
auto local_x = j;
auto local_y = i;
auto local_matrix_width = T;
auto base_x = bj * local_matrix_width;
auto base_y = bi * local_matrix_width;
auto x = base_x + local_x;
auto y = base_y + local_y;
return index1(y, x, rows, cols);
}
//
// step 06
//
__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
auto step_count = (); //
}
} // namespace kernel
//
// GPU by bloc
//
std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
//
// step 07
//
std::vector<int> C(N * P);
return C;
}

31
gpu/tp5/c/src/matrix.h Normal file
View file

@ -0,0 +1,31 @@
#pragma once
#include <vector>
#include <iostream>
constexpr int threads_per_bloc = 16;
constexpr int T = threads_per_bloc;
//
// CPU
//
std::vector<int> matmul1(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);
//
// GPU
//
std::vector<int> matmul2(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);
//
// GPU by bloc
//
std::vector<int> matmul3(
const std::vector<int>& A,
const std::vector<int>& B,
int N, int M, int P);

BIN
gpu/tp5/tp5.pdf Normal file

Binary file not shown.