gpu

2024-03-28 17:58:33 +01:00 · 2024-03-28 17:58:33 +01:00 · d976cfaf74
commit d976cfaf74
parent 8b3bb9c382
37 changed files with 2669 additions and 371 deletions
--- a/gpu/tp2/.clang-format
+++ b/gpu/tp2/.clang-format
@ -0,0 +1,9 @@
+---
+BasedOnStyle: LLVM
+DerivePointerAlignment: 'false'
+IndentWidth: '4'
+PointerAlignment: Left
+TabWidth: '4'
+UseTab: Always
+
+...
--- a/gpu/tp2/.clangd
+++ b/gpu/tp2/.clangd
@ -0,0 +1,4 @@
+CompileFlags:
+  Add:
+    - -xcuda
+    - --no-cuda-version-check
--- a/gpu/tp2/.vscode/settings.json
+++ b/gpu/tp2/.vscode/settings.json
@ -0,0 +1,15 @@
+{
+    "files.associations": {
+        "iostream": "cpp",
+        "atomic": "cpp",
+        "*.tcc": "cpp",
+        "chrono": "cpp",
+        "compare": "cpp",
+        "optional": "cpp",
+        "future": "cpp",
+        "numeric": "cpp",
+        "sstream": "cpp",
+        "cmath": "cpp",
+        "*.def": "cpp"
+    }
+}
--- a/gpu/tp2/README.md
+++ b/gpu/tp2/README.md
@ -0,0 +1,14 @@
+# TP 2
+
+> `Matthieu JOLIMAITRE <matthieu.jolimaitre@epita.fr>`
+
+## Questions 
+
+1. Pour une matrice de `W × H`, l'index 'aplatis' de l'élément de coordonnées `(x, y)` est `k := (y × W) + x`.
+2. Pour une matrice de `W × H`, les coordonnées de l'élément à l'index 'aplatis' `k` sont :
+    - `x := k % W`  (Avec `%`  le reste euclidien).
+    - `y := k // W` (Avec `//` le quotient euclidien).
+3. Pour une matrice de 10 lignes de 100 colonnes, contenant des int (4 octets) ; Stocker les lignes avec un pitch devant être multiple de 128 :
+    - La longueur d'une ligne sera `100 × 4 = 400` octets.
+    - Le pitch sera le prochain produit de la table de 128 : `(400 // 128) + 1 = 512` octets.
+    - Le padding sera donc le pitch moins la longueur de la donnée : `512 - 400 = 112` octets.
--- a/gpu/tp2/c/build.sh
+++ b/gpu/tp2/c/build.sh
@ -1,20 +1,25 @@
 #!/bin/sh
 cd "$(dirname "$(realpath "$0")")"
 set -e
+alias log="echo '[build.sh]'"

-TARGET="ex1.cu ex2.cu ex3.cu ex4.cu"
+TARGET="ex1 ex2 ex3"

 if [ $# -gt 0 ]
-then TARGET=$1
+then targets=$@
 fi

+
 rm -fr bin
 mkdir -p bin

-for target in $TARGET
-do nvcc src/$target -o bin/${target%.cu}.out
-done
+ccargs=""
+#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"

-for target in $TARGET
-do ./bin/${target%.cu}.out
+
+for target in $targets
+do
+	echo ""
+	nvcc $ccargs -o bin/${target}.out src/${target}.cu
+	./bin/${target}.out
 done
--- a/gpu/tp2/c/src/Matrix.h
+++ b/gpu/tp2/c/src/Matrix.h
@ -1,14 +1,16 @@
 #pragma once

-#include <vector>
 #include <iostream>
+#include <vector>

-#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
-inline void cuda_check(cudaError_t code, const char *file, int line) {
-    if(code != cudaSuccess) {
-        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl; 
-        std::abort();
-    }
+#define CUDA_CHECK(code)                                                       \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] "
+				  << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
 }

 namespace linalg {
@ -16,55 +18,53 @@ namespace linalg {
 //
 // Generic matrix of type T (int, float, double...)
 //
-template<typename T>
-class Matrix
-{
-public:
-    // construct matrix, allocate the 2D pitched memory on the device
-    __host__ Matrix(int rows, int cols);
+template <typename T> class Matrix {
+  public:
+	// construct matrix, allocate the 2D pitched memory on the device
+	__host__ Matrix(int rows, int cols);

-    // free allocated device memory
-    __host__ void free();
+	// free allocated device memory
+	__host__ void free();

-public:
-    // copy values from host std::vector to device Matrix
-    // values must be a vector of size rows x cols
-    // allocation is already done in the constructor
-    __host__ void to_cuda(const std::vector<T>& values);
+  public:
+	// copy values from host std::vector to device Matrix
+	// values must be a vector of size rows x cols
+	// allocation is already done in the constructor
+	__host__ void to_cuda(const std::vector<T>& values);

-    // copy values from device Matrix to host std::vector
-    // values may not ne resized
-    __host__ void to_cpu(std::vector<T>& values) const;
+	// copy values from device Matrix to host std::vector
+	// values may not ne resized
+	__host__ void to_cpu(std::vector<T>& values) const;

-public:
-    // accessor at row i and column j
-    __device__ const T& operator()(int i, int j) const;
-    __device__       T& operator()(int i, int j);
+  public:
+	// accessor at row i and column j
+	__device__ const T& operator()(int i, int j) const;
+	__device__ T& operator()(int i, int j);

-public:
-    __host__ Matrix operator + (const Matrix<T>& other) const;
-    __host__ Matrix operator - (const Matrix<T>& other) const;
-    __host__ Matrix operator * (const Matrix<T>& other) const;
-    __host__ Matrix operator / (const Matrix<T>& other) const;
+  public:
+	__host__ Matrix operator+(const Matrix<T>& other) const;
+	__host__ Matrix operator-(const Matrix<T>& other) const;
+	__host__ Matrix operator*(const Matrix<T>& other) const;
+	__host__ Matrix operator/(const Matrix<T>& other) const;

-private:
-    // apply binary functor f on all pairs of elements
-    // f must provide the following operator 
-    //
-    //     T operator()(T a, T b)
-    //
-    // template<typename BinaryFunctor>
-    // __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;
+  private:
+	// apply binary functor f on all pairs of elements
+	// f must provide the following operator
+	//
+	//     T operator()(T a, T b)
+	//
+	// template<typename BinaryFunctor>
+	// __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;

-public:
-    __host__ __device__ inline int rows() const {return m_rows;}
-    __host__ __device__ inline int cols() const {return m_cols;}
+  public:
+	__host__ __device__ inline int rows() const { return m_rows; }
+	__host__ __device__ inline int cols() const { return m_cols; }

-private:
-    T* m_data_ptr; // device pointer
-    int m_rows;
-    int m_cols;
-    size_t m_pitch;
+  private:
+	T* m_data_ptr; // device pointer
+	int m_rows;
+	int m_cols;
+	size_t m_pitch;
 };

 } // namespace linalg
--- a/gpu/tp2/c/src/Matrix.hpp
+++ b/gpu/tp2/c/src/Matrix.hpp
@ -1,5 +1,12 @@
+#pragma once
+
 #include "Matrix.h"

+#define RANGE(i, from, to)                                                     \
+	int i = from;                                                              \
+	i < to;                                                                    \
+	i += 1
+
 namespace linalg {

 namespace kernel {
@ -8,92 +15,111 @@ namespace kernel {
 // step 10
 // CUDA kernel add
 //
-
-
+template <typename T>
+__device__ void add(const Matrix<T>* a, const Matrix<T>* b, Matrix<T>* res) {
+	auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	if (x >= res->cols())
+		return;
+	if (y >= res->rows())
+		return;
+	auto a_ref = (const Matrix<T>&)(a);
+	auto b_ref = (const Matrix<T>&)(b);
+	auto res_ref = (Matrix<T>&)(res);
+	auto res_ptr = &res_ref(x, y);
+	*res_ptr = *(&a_ref(x, y)) + *b_ref(x, y);
+}

 //
 // step 12
 // CUDA kernel apply
 //

-
-
-
 } // namespace kernel

-
-template<typename T>
-__host__ Matrix<T>::Matrix(int rows, int cols) :
-    m_data_ptr(nullptr),
-    m_rows(rows),
-    m_cols(cols),
-    m_pitch(0)
-{
-    // step 07
-
+template <typename T>
+__host__ Matrix<T>::Matrix(int rows, int cols)
+	: m_data_ptr(nullptr), m_rows(rows), m_cols(cols), m_pitch(0) {
+	auto line_width = cols * sizeof(T);
+	// step 07
+	cudaMallocPitch(&this->m_data_ptr, &this->m_pitch, //
+					line_width, rows				   //
+	);
 }

-template<typename T>
-__host__ void Matrix<T>::free()
-{
-    // step 07
-
+template <typename T> __host__ void Matrix<T>::free() {
+	// step 07
+	cudaFree(this->m_data_ptr);
 }

-template<typename T>
-__host__ void Matrix<T>::to_cuda(const std::vector<T>& values)
-{
-    // step 08
-
+template <typename T>
+__host__ void Matrix<T>::to_cuda(const std::vector<T>& values) {
+	// step 08
+	auto vec_line_width = this->m_cols * sizeof(T);
+	auto vec_arr_ptr = &values.front();
+	cudaMemcpy2D(this->m_data_ptr, this->m_pitch, vec_arr_ptr, vec_line_width,
+				 vec_line_width, this->m_rows, cudaMemcpyHostToDevice);
 }

-template<typename T>
-__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const
-{
-    // step 08
-
+template <typename T>
+__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const {
+	// step 08
+	auto vec_line_width = this->m_cols * sizeof(T);
+	auto vec_arr_ptr = &values.front();
+	cudaMemcpy2D(vec_arr_ptr, vec_line_width, this->m_data_ptr, this->m_pitch,
+				 vec_line_width, this->m_rows, cudaMemcpyDeviceToHost);
 }

-template<typename T>
-__device__ const T& Matrix<T>::operator()(int i, int j) const
-{
-    // step 09
-
+template <typename T>
+__device__ const T& Matrix<T>::operator()(int i, int j) const {
+	// step 09
+	if (i >= this->m_cols)
+		return NULL;
+	if (j >= this->m_rows)
+		return NULL;
+	auto offset = (j * this->m_pitch) + (i * sizeof(T));
+	auto base_ptr = (u_int8_t*)(this->m_data_ptr);
+	auto result = base_ptr + offset;
+	return (const T&)(result);
 }

-template<typename T>
-__device__ T& Matrix<T>::operator()(int i, int j)
-{
-    // step 09
-
+template <typename T> __device__ T& Matrix<T>::operator()(int i, int j) {
+	// step 09
+	// if (i >= this->m_cols)
+	//     return nullptr;
+	// if (j >= this->m_rows)
+	//     return nullptr;
+	auto offset = (j * this->m_pitch) + (i * sizeof(T));
+	auto base_ptr = (u_int8_t*)(this->m_data_ptr);
+	auto result = base_ptr + offset;
+	return (T&)(result);
 }

-template<typename T>
-__host__ Matrix<T> Matrix<T>::operator + (const Matrix<T>& other) const
-{
-    // step 11
-
+template <typename T>
+__host__ Matrix<T> Matrix<T>::operator+(const Matrix<T>& other) const {
+	// step 11
+	auto width = min(this->m_cols, other.m_cols);
+	auto height = min(this->m_rows, other.m_rows);
+	auto res = Matrix<T>(width, height);
+	auto threads_per_block = dim3(32, 32, 1);
+	auto blocks = dim3(width / 32 + 1, height / 32 + 1, 1);
+	kernel::add<T><<<blocks, threads_per_block>>>(this, &other, &res);
+	return res;
 }

-template<typename T>
-__host__ Matrix<T> Matrix<T>::operator - (const Matrix<T>& other) const
-{
-    // step 12
-
+template <typename T>
+__host__ Matrix<T> Matrix<T>::operator-(const Matrix<T>& other) const {
+	// step 12
 }

-template<typename T>
-__host__ Matrix<T> Matrix<T>::operator * (const Matrix<T>& other) const
-{
-    // step 12
-
+template <typename T>
+__host__ Matrix<T> Matrix<T>::operator*(const Matrix<T>& other) const {
+	// step 12
 }

-template<typename T>
-__host__ Matrix<T> Matrix<T>::operator / (const Matrix<T>& other) const
-{
-    // step 12
-
+template <typename T>
+__host__ Matrix<T> Matrix<T>::operator/(const Matrix<T>& other) const {
+	// step 12
 }

 } // namespace linalg
--- a/gpu/tp2/c/src/ex1.cu
+++ b/gpu/tp2/c/src/ex1.cu
@ -3,76 +3,97 @@
 //
 // example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
 //
-#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
-inline void cuda_check(cudaError_t code, const char *file, int line) {
-    if(code != cudaSuccess) {
-        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl; 
-        std::abort();
-    }
+#define CUDA_CHECK(code)                                                       \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] "
+				  << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
 }

-
 //
 // step 01
 // return the linear index corresponding to the element at row i and column j
 // in a matrix of size rows x cols, using row-major storage
 //
 __device__ int linear_index(int i, int j, int rows, int cols) {
-    
+	if (i >= rows)
+		return -1;
+	if (j >= cols)
+		return -1;
+	return i * cols + j;
 }

 //
 // step 02
-// CUDA kernel add 
-//
-
-
-int main()
-{
-    constexpr int rows = 200;
-    constexpr int cols = 80;
-    int* x = (int*)malloc(rows*cols*sizeof(int));
-    int* y = (int*)malloc(rows*cols*sizeof(int));
-    for(int i = 0; i < rows*cols; ++i) {
-        x[i] = i;
-        y[i] = std::pow(-1,i) * i;
-    }
-
-    //
-    // step 03
-    //
-    int* dx;
-    int* dy;
-    // 1. allocate on device
-
-    // 2. copy from host to device
-
-    // 3. launch CUDA kernel
-    // const dim3 threads_per_bloc{32,32,1};
-
-    // 4. copy result from device to host
-
-    // 5. free device memory
-
-
-
-    // checking results
-    bool ok = true;
-    for(int i = 0; i < rows*cols; ++i) {
-        const int expected_result = std::pow(-1,i) * i + i;
-        if(y[i] != expected_result) {
-            std::cout << "Failure" << std::endl;
-            std::cout << "Result at index i=" 
-                << i << ": expected " 
-                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
-            ok = false;
-            break;
-        }
-    }
-    if(ok) std::cout << "Success" << std::endl;
-
-    free(x);
-    free(y);
-    
-    return 0;
+// CUDA kernel add
+__global__ void add(const int* dx, int* dy, int rows, int cols) {
+	auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
+	auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
+	auto index = linear_index(j, i, rows, cols);
+	if (index == -1)
+		return;
+	auto res = dx[index] + dy[index];
+	dy[index] = res;
+}
+
+int main() {
+	constexpr int rows = 200;
+	constexpr int cols = 80;
+	int* x = (int*)malloc(rows * cols * sizeof(int));
+	int* y = (int*)malloc(rows * cols * sizeof(int));
+	for (int i = 0; i < rows * cols; ++i) {
+		x[i] = i;
+		y[i] = std::pow(-1, i) * i;
+	}
+
+	//
+	// step 03
+	//
+	int* dx;
+	int* dy;
+	// 1. allocate on device
+	auto size = rows * cols * sizeof(int);
+	CUDA_CHECK(cudaMalloc(&dx, size));
+	CUDA_CHECK(cudaMalloc(&dy, size));
+
+	// 2. copy from host to device
+	CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
+
+	// 3. launch CUDA kernel
+	const dim3 threads_per_bloc{32, 32, 1};
+	auto blocks = dim3(cols / 32 + 1, rows / 32 + 1, 1);
+	add<<<blocks, threads_per_bloc>>>(dx, dy, rows, cols);
+
+	// 4. copy result from device to host
+	CUDA_CHECK(cudaMemcpy(x, dx, size, cudaMemcpyDeviceToHost));
+	CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));
+
+	// 5. free device memory
+	CUDA_CHECK(cudaFree(dx));
+	CUDA_CHECK(cudaFree(dy));
+
+	// checking results
+	bool ok = true;
+	for (int i = 0; i < rows * cols; ++i) {
+		const int expected_result = std::pow(-1, i) * i + i;
+		if (y[i] != expected_result) {
+			std::cout << "Failure" << std::endl;
+			std::cout << "Result at index i=" << i << ": expected "
+					  << std::pow(-1, i) * i << '+' << i << '='
+					  << expected_result << ", got " << y[i] << std::endl;
+			ok = false;
+			break;
+		}
+	}
+	if (ok)
+		std::cout << "Success" << std::endl;
+
+	free(x);
+	free(y);
+
+	return 0;
 }
--- a/gpu/tp2/c/src/ex2.cu
+++ b/gpu/tp2/c/src/ex2.cu
@ -3,78 +3,110 @@
 //
 // example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
 //
-#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
-inline void cuda_check(cudaError_t code, const char *file, int line) {
-    if(code != cudaSuccess) {
-        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl; 
-        std::abort();
-    }
+#define CUDA_CHECK(code)                                                       \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] "
+				  << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
 }

+#define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")"
+
 //
 // step 04
-// return a pointer to the value at row i and column j from base_address 
+// return a pointer to the value at row i and column j from base_address
 // with pitch in bytes
 //
 __device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
-    
+	auto offset = i * pitch + (j * sizeof(int));
+	auto ptr = (char*)base_address;
+	return (int*)(ptr + offset);
 }

 //
 // step 05
-// CUDA kernel add 
-//
-
-
-
-
-int main()
-{
-    constexpr int rows = 200;
-    constexpr int cols = 80;
-    int* x = (int*)malloc(rows*cols*sizeof(int));
-    int* y = (int*)malloc(rows*cols*sizeof(int));
-    for(int i = 0; i < rows*cols; ++i) {
-        x[i] = i;
-        y[i] = std::pow(-1,i) * i;
-    }
-
-    //
-    // step 06
-    //
-    int* dx;
-    int* dy;
-    size_t pitch;
-    // 1. allocate on device
-
-    // 2. copy from host to device
-
-    // 3. launch CUDA kernel
-    // const dim3 threads_per_bloc{32,32,1};
-
-    // 4. copy result from device to host
-
-    // 5. free device memory
-
-
-
-    // checking results
-    bool ok = true;
-    for(int i = 0; i < rows*cols; ++i) {
-        const int expected_result = std::pow(-1,i) * i + i;
-        if(y[i] != expected_result) {
-            std::cout << "Failure" << std::endl;
-            std::cout << "Result at index i=" 
-                << i << ": expected " 
-                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
-            ok = false;
-            break;
-        }
-    }
-    if(ok) std::cout << "Success" << std::endl;
-
-    free(x);
-    free(y);
-    
-    return 0;
+// CUDA kernel add
+__global__ void add_(int* a, int* b, size_t pitch, size_t width,
+					 size_t height) {
+	auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	if (x >= width)
+		return;
+	if (y >= height)
+		return;
+	auto ptr_a = get_ptr(a, y, x, pitch);
+	auto ptr_b = get_ptr(b, y, x, pitch);
+	auto res = *ptr_a + *ptr_b;
+	*ptr_b = res;
+}
+
+int main() {
+	constexpr int rows = 200;
+	constexpr int cols = 80;
+	int* x = (int*)malloc(rows * cols * sizeof(int));
+	int* y = (int*)malloc(rows * cols * sizeof(int));
+	for (int i = 0; i < rows * cols; ++i) {
+		x[i] = i;
+		y[i] = std::pow(-1, i) * i;
+	}
+
+	//
+	// step 06
+	//
+	int* dx;
+	int* dy;
+	size_t pitch;
+	// 1. allocate on device
+	CUDA_CHECK(cudaMallocPitch(&dx, &pitch, cols * sizeof(int), rows));
+	CUDA_CHECK(cudaMallocPitch(&dy, &pitch, cols * sizeof(int), rows));
+
+	// 2. copy from host to device
+	auto arr_width = cols * sizeof(int);
+	CUDA_CHECK(cudaMemcpy2D(dx, pitch,				  //
+							x, arr_width,			  //
+							cols * sizeof(int), rows, //
+							cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy2D(dy, pitch,				  //
+							y, arr_width,			  //
+							cols * sizeof(int), rows, //
+							cudaMemcpyHostToDevice));
+
+	// 3. launch CUDA kernel
+	const auto threads_per_bloc = dim3(32, 32, 1);
+	const auto blocks = dim3(cols / 32 + 1, rows / 32 + 1, 1);
+	add_<<<blocks, threads_per_bloc>>>(dx, dy, pitch, cols, rows);
+
+	// 4. copy result from device to host
+	CUDA_CHECK(cudaMemcpy2D(y, arr_width,			  //
+							dy, pitch,				  //
+							cols * sizeof(int), rows, //
+							cudaMemcpyDeviceToHost));
+
+	// 5. free device memory
+	cudaFree(dx);
+	cudaFree(dy);
+
+	// checking results
+	bool ok = true;
+	for (int i = 0; i < rows * cols; ++i) {
+		const int expected_result = std::pow(-1, i) * i + i;
+		if (y[i] != expected_result) {
+			std::cout << "Failure" << std::endl;
+			std::cout << "Result at index i=" << i << ": expected "
+					  << std::pow(-1, i) * i << '+' << i << '='
+					  << expected_result << ", got " << y[i] << std::endl;
+			ok = false;
+			break;
+		}
+	}
+	if (ok)
+		std::cout << "Success" << std::endl;
+
+	free(x);
+	free(y);
+
+	return 0;
 }
--- a/gpu/tp2/c/src/ex3.cu
+++ b/gpu/tp2/c/src/ex3.cu
@ -1,148 +1,171 @@
 #include "Matrix.h"

-int main()
-{
-    {
-        const int rows = 4;
-        const int cols = 4;
-        // instantiate two matrices of integers on the device
-        linalg::Matrix<int> A(rows, cols);
-        linalg::Matrix<int> B(rows, cols);
-        // fill the two matrices
-        A.to_cuda({ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16});
-        B.to_cuda({16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
+int main() {
+	{
+		const int rows = 4;
+		const int cols = 4;
+		// instantiate two matrices of integers on the device
+		linalg::Matrix<int> A(rows, cols);
+		linalg::Matrix<int> B(rows, cols);
+		// fill the two matrices
+		A.to_cuda({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+		B.to_cuda({16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});

-        // compute the sum
-        auto C = A + B;
+		// compute the sum
+		auto C = A + B;

-        // transfert the result on the host
-        std::vector<int> c_res;
-        C.to_cpu(c_res);
-        C.free();
-       
-        // check results
-        const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
-        if(c_res != c_expected) {
-            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
-            std::cout << "  expected: ";
-            for(int i : c_expected) std::cout << i << " ";
-            std::cout << std::endl;
-            std::cout << "  got:      ";
-            for(int i : c_res) std::cout << i << " ";
-            std::cout << std::endl;
-        } else {
-            std::cout << "Success" << std::endl;
-        }
+		// transfert the result on the host
+		std::vector<int> c_res;
+		C.to_cpu(c_res);
+		C.free();

-        // compute the difference
-        auto D = A - B;
+		// check results
+		const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17,
+										  17, 17, 17, 17, 17, 17, 17, 17};
+		if (c_res != c_expected) {
+			std::cout << __FILE__ << ":" << __LINE__
+					  << ": Failure (+):" << std::endl;
+			std::cout << "  expected: ";
+			for (int i : c_expected)
+				std::cout << i << " ";
+			std::cout << std::endl;
+			std::cout << "  got:      ";
+			for (int i : c_res)
+				std::cout << i << " ";
+			std::cout << std::endl;
+		} else {
+			std::cout << "Success" << std::endl;
+		}

-        // transfert the result on the host
-        std::vector<int> d_res;
-        D.to_cpu(d_res);
-        D.free();
+		// compute the difference
+		auto D = A - B;

-        // check results
-        const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15};
-        if(d_res != d_expected) {
-            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
-            std::cout << "  expected: ";
-            for(int i : d_expected) std::cout << i << " ";
-            std::cout << std::endl;
-            std::cout << "  got:      ";
-            for(int i : d_res) std::cout << i << " ";
-            std::cout << std::endl;
-        } else {
-            std::cout << "Success" << std::endl;
-        }
-    }
-    // ------------------------------------------------------------------------
-    {
-        const int rows = 89;
-        const int cols = 128;
-        linalg::Matrix<float> A(rows, cols);
-        linalg::Matrix<float> B(rows, cols);
-        std::vector<float> a_values(rows*cols);
-        std::vector<float> b_values(rows*cols);
-        for(int i = 0; i < rows*cols; ++i) {
-            a_values[i] = 1 + float(i) / 100;
-            b_values[i] = std::pow(-1, i) * float(i)/(rows*cols) * 100;
-        }
-        A.to_cuda(a_values);
-        B.to_cuda(b_values);
+		// transfert the result on the host
+		std::vector<int> d_res;
+		D.to_cpu(d_res);
+		D.free();

-        auto C = A + B;
-        auto D = A - B;
-        auto E = A * B;
-        auto F = A / B;
+		// check results
+		const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1,
+										  1,   3,	5,	 7,	 9,	 11, 13, 15};
+		if (d_res != d_expected) {
+			std::cout << __FILE__ << ":" << __LINE__
+					  << ": Failure (-):" << std::endl;
+			std::cout << "  expected: ";
+			for (int i : d_expected)
+				std::cout << i << " ";
+			std::cout << std::endl;
+			std::cout << "  got:      ";
+			for (int i : d_res)
+				std::cout << i << " ";
+			std::cout << std::endl;
+		} else {
+			std::cout << "Success" << std::endl;
+		}
+	}
+	// ------------------------------------------------------------------------
+	{
+		const int rows = 89;
+		const int cols = 128;
+		linalg::Matrix<float> A(rows, cols);
+		linalg::Matrix<float> B(rows, cols);
+		std::vector<float> a_values(rows * cols);
+		std::vector<float> b_values(rows * cols);
+		for (int i = 0; i < rows * cols; ++i) {
+			a_values[i] = 1 + float(i) / 100;
+			b_values[i] = std::pow(-1, i) * float(i) / (rows * cols) * 100;
+		}
+		A.to_cuda(a_values);
+		B.to_cuda(b_values);

-        std::vector<float> c_values;
-        C.to_cpu(c_values);
-        std::vector<float> d_values;
-        D.to_cpu(d_values);
-        std::vector<float> e_values;
-        E.to_cpu(e_values);
-        std::vector<float> f_values;
-        F.to_cpu(f_values);
+		auto C = A + B;
+		auto D = A - B;
+		auto E = A * B;
+		auto F = A / B;

-        C.free();
-        D.free();
-        E.free();
-        F.free();
+		std::vector<float> c_values;
+		C.to_cpu(c_values);
+		std::vector<float> d_values;
+		D.to_cpu(d_values);
+		std::vector<float> e_values;
+		E.to_cpu(e_values);
+		std::vector<float> f_values;
+		F.to_cpu(f_values);

-        const float epsilon = 0.001;
-        bool ok = true;
-        for(int i = 0; i < rows*cols; ++i) {
-            const float diff = std::abs( c_values[i] - (a_values[i] + b_values[i]) );
-            if(diff > epsilon) {
-                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
-                std::cout << "  expected: " << a_values[i] + b_values[i] << std::endl;
-                std::cout << "  got: "      << c_values[i] << std::endl;
-                ok = false;
-                break;
-            }
-        }
-        if(ok) std::cout << "Success" << std::endl;
-        
-        ok = true;
-        for(int i = 0; i < rows*cols; ++i) {
-            const float diff = std::abs( d_values[i] - (a_values[i] - b_values[i]) );
-            if(diff > epsilon) {
-                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
-                std::cout << "  expected: " << a_values[i] - b_values[i] << std::endl;
-                std::cout << "  got: "      << d_values[i] << std::endl;
-                ok = false;
-                break;
-            }
-        }
-        if(ok) std::cout << "Success" << std::endl;
+		C.free();
+		D.free();
+		E.free();
+		F.free();

-        ok = true;
-        for(int i = 0; i < rows*cols; ++i) {
-            const float diff = std::abs( e_values[i] - (a_values[i] * b_values[i]) );
-            if(diff > epsilon) {
-                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (*):" << std::endl;
-                std::cout << "  expected: " << a_values[i] * b_values[i] << std::endl;
-                std::cout << "  got: "      << e_values[i] << std::endl;
-                ok = false;
-                break;
-            }
-        }
-        if(ok) std::cout << "Success" << std::endl;
+		const float epsilon = 0.001;
+		bool ok = true;
+		for (int i = 0; i < rows * cols; ++i) {
+			const float diff =
+				std::abs(c_values[i] - (a_values[i] + b_values[i]));
+			if (diff > epsilon) {
+				std::cout << __FILE__ << ":" << __LINE__
+						  << ": Failure (+):" << std::endl;
+				std::cout << "  expected: " << a_values[i] + b_values[i]
+						  << std::endl;
+				std::cout << "  got: " << c_values[i] << std::endl;
+				ok = false;
+				break;
+			}
+		}
+		if (ok)
+			std::cout << "Success" << std::endl;

-        ok = true;
-        for(int i = 0; i < rows*cols; ++i) {
-            const float diff = std::abs( f_values[i] - (a_values[i] / b_values[i]) );
-            if(diff > epsilon) {
-                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (/):" << std::endl;
-                std::cout << "  expected: " << a_values[i] / b_values[i] << std::endl;
-                std::cout << "  got: "      << f_values[i] << std::endl;
-                ok = false;
-                break;
-            }
-        }
-        if(ok) std::cout << "Success" << std::endl;
-    }
+		ok = true;
+		for (int i = 0; i < rows * cols; ++i) {
+			const float diff =
+				std::abs(d_values[i] - (a_values[i] - b_values[i]));
+			if (diff > epsilon) {
+				std::cout << __FILE__ << ":" << __LINE__
+						  << ": Failure (-):" << std::endl;
+				std::cout << "  expected: " << a_values[i] - b_values[i]
+						  << std::endl;
+				std::cout << "  got: " << d_values[i] << std::endl;
+				ok = false;
+				break;
+			}
+		}
+		if (ok)
+			std::cout << "Success" << std::endl;

-    return 0;
+		ok = true;
+		for (int i = 0; i < rows * cols; ++i) {
+			const float diff =
+				std::abs(e_values[i] - (a_values[i] * b_values[i]));
+			if (diff > epsilon) {
+				std::cout << __FILE__ << ":" << __LINE__
+						  << ": Failure (*):" << std::endl;
+				std::cout << "  expected: " << a_values[i] * b_values[i]
+						  << std::endl;
+				std::cout << "  got: " << e_values[i] << std::endl;
+				ok = false;
+				break;
+			}
+		}
+		if (ok)
+			std::cout << "Success" << std::endl;
+
+		ok = true;
+		for (int i = 0; i < rows * cols; ++i) {
+			const float diff =
+				std::abs(f_values[i] - (a_values[i] / b_values[i]));
+			if (diff > epsilon) {
+				std::cout << __FILE__ << ":" << __LINE__
+						  << ": Failure (/):" << std::endl;
+				std::cout << "  expected: " << a_values[i] / b_values[i]
+						  << std::endl;
+				std::cout << "  got: " << f_values[i] << std::endl;
+				ok = false;
+				break;
+			}
+		}
+		if (ok)
+			std::cout << "Success" << std::endl;
+	}
+
+	return 0;
 }
--- a/gpu/tp3/.clang-format
+++ b/gpu/tp3/.clang-format
@ -0,0 +1,13 @@
+# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
+---
+BasedOnStyle: LLVM
+DerivePointerAlignment: false
+IndentWidth: 4
+PointerAlignment: Left
+TabWidth: 4
+UseTab: Always
+AllowShortIfStatementsOnASingleLine: AllIfsAndElse
+AllowShortLoopsOnASingleLine: true
+ColumnLimit: 120
+AllowShortBlocksOnASingleLine: Always
+AllowShortFunctionsOnASingleLine: All
--- a/gpu/tp3/.clangd
+++ b/gpu/tp3/.clangd
@ -0,0 +1,4 @@
+CompileFlags:
+  Add:
+    - -xcuda
+    - --no-cuda-version-check
--- a/gpu/tp3/.gitignore
+++ b/gpu/tp3/.gitignore
@ -0,0 +1,2 @@
+bin/
+/*.zip
--- a/gpu/tp3/README.md
+++ b/gpu/tp3/README.md
@ -0,0 +1,3 @@
+# TP 3
+
+> `Matthieu JOLIMAITRE <matthieu.jolimaitre@epita.fr>`
--- a/gpu/tp3/c/build.sh
+++ b/gpu/tp3/c/build.sh
@ -0,0 +1,25 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+alias log="echo '[build.sh]'"
+
+TARGET="ex1 ex2"
+
+if [ $# -gt 0 ]
+then targets=$@
+fi
+
+
+rm -fr bin
+mkdir -p bin
+
+ccargs="-O2"
+#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
+
+
+for target in $targets
+do
+	echo ""
+	nvcc $ccargs -o bin/${target}.out src/${target}.cu
+	./bin/${target}.out
+done
--- a/gpu/tp3/c/src/ex1.cu
+++ b/gpu/tp3/c/src/ex1.cu
@ -0,0 +1,106 @@
+#include <cstddef>
+#include <iostream>
+
+#define RANGE(I, FROM, TO)                                                                                             \
+	size_t I = FROM;                                                                                                   \
+	I < TO;                                                                                                            \
+	I += 1
+
+//
+// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
+//
+#define CUDA_CHECK(code)                                                                                               \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
+}
+
+constexpr int bloc_count = 128;		 // constexpr equivalent to blockDim.x in CUDA kernel
+constexpr int threads_per_bloc = 32; // constexpr equivalent to gridDim.x  in CUDA kernel
+
+constexpr int B = bloc_count;
+constexpr int T = threads_per_bloc;
+
+//
+// step 01
+//
+// dx: array of size N
+// dy: array of size N
+// dz: array of size B
+//
+
+typedef struct {
+	size_t from;
+	size_t to;
+} StrideRange;
+#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
+
+__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
+															   size_t block_id, size_t thread_id) {
+	auto global_threads = block_dim * grid_dim;
+	auto items_per_threads = (array_length / global_threads) + 1;
+	auto global_thread_index = block_id * block_dim + thread_id;
+	auto from = global_thread_index * items_per_threads;
+	auto to = from + items_per_threads;
+	return StrideRange{from, to};
+}
+
+__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
+	__shared__ int buffer[T];
+	auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
+	if (range.from >= N) return;
+	buffer[threadIdx.x] = 0;
+	for (RANGE(i, range.from, range.to))
+		if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
+	__syncthreads();
+	if (threadIdx.x != 0) return;
+	dz[blockIdx.x] = 0;
+	for (RANGE(i, 0, T)) dz[blockIdx.x] += buffer[i];
+}
+
+int main() {
+	constexpr int N = 1e6;
+
+	int* x = (int*)malloc(N * sizeof(int));
+	int* y = (int*)malloc(N * sizeof(int));
+	int host_expected_result = 0;
+	for (int i = 0; i < N; i++) {
+		x[i] = i % 10;
+		y[i] = i % 3 - 1;
+		host_expected_result += x[i] * y[i];
+	}
+
+	// step 02
+	int *dx, *dy, *dz;
+	auto size = N * sizeof(int);
+	auto res_size = B * sizeof(int);
+	cudaMalloc(&dx, size);
+	cudaMalloc(&dy, size);
+	cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
+	cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
+	cudaMalloc(&dz, res_size);
+
+	// step 03
+	dot<<<B, T>>>(N, dx, dy, dz);
+	int result = 0;
+	int* z = (int*)malloc(res_size);
+	cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
+	for (RANGE(i, 0, B)) result += z[i];
+
+	// checking results
+	if (host_expected_result == result) {
+		std::cout << "Success" << std::endl;
+	} else {
+		std::cout << "Error" << std::endl;
+		std::cout << "  expected: " << host_expected_result << std::endl;
+		std::cout << "  got: " << result << std::endl;
+	}
+
+	free(x);
+	free(y);
+
+	return 0;
+}
--- a/gpu/tp3/c/src/ex2.cu
+++ b/gpu/tp3/c/src/ex2.cu
@ -0,0 +1,119 @@
+#include <iostream>
+
+//
+// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
+//
+#define CUDA_CHECK(code)                                                                                               \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
+}
+
+constexpr int bloc_count = 128;		 // constexpr equivalent to blockDim.x in CUDA kernel
+constexpr int threads_per_bloc = 32; // constexpr equivalent to gridDim.x  in CUDA kernel
+
+constexpr int B = bloc_count;
+constexpr int T = threads_per_bloc;
+
+//
+// step 04
+//
+// dx: array of size N
+// dy: array of size N
+// dz: array of size B
+//
+
+#define RANGE(I, FROM, TO)                                                                                             \
+	size_t I = FROM;                                                                                                   \
+	I < TO;                                                                                                            \
+	I += 1
+
+#define loop while (1)
+
+typedef struct {
+	size_t from;
+	size_t to;
+} StrideRange;
+#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
+
+__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
+															   size_t block_id, size_t thread_id) {
+	auto global_threads = block_dim * grid_dim;
+	auto items_per_threads = (array_length / global_threads) + 1;
+	auto global_thread_index = block_id * block_dim + thread_id;
+	auto from = global_thread_index * items_per_threads;
+	auto to = from + items_per_threads;
+	return StrideRange{from, to};
+}
+
+__device__ void reduce_rec(int N, int* array) {
+	auto length = N;
+	auto thread_id = threadIdx.x;
+	loop {
+		if (length <= 1) return;
+		auto half = length / 2;
+		auto used_threads = half;
+		if (thread_id >= used_threads) return;
+		__syncthreads();
+		array[thread_id] += array[thread_id + half];
+		length = half;
+	}
+}
+
+__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
+	__shared__ int buffer[T];
+	auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
+	if (range.from >= N) return;
+	buffer[threadIdx.x] = 0;
+	for (RANGE(i, range.from, range.to))
+		if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
+	reduce_rec(T, buffer);
+	if (threadIdx.x != 0) return;
+	dz[blockIdx.x] = buffer[0];
+}
+
+int main() {
+	constexpr int N = 1e6;
+
+	int* x = (int*)malloc(N * sizeof(int));
+	int* y = (int*)malloc(N * sizeof(int));
+	int host_expected_result = 0;
+	for (int i = 0; i < N; i++) {
+		x[i] = i % 10;
+		y[i] = i % 3 - 1;
+		host_expected_result += x[i] * y[i];
+	}
+
+	// step 05
+	int result = 0;
+	int *dx, *dy, *dz;
+	auto size = N * sizeof(int);
+	auto res_size = B * sizeof(int);
+	cudaMalloc(&dx, size);
+	cudaMalloc(&dy, size);
+	cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
+	cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
+	cudaMalloc(&dz, res_size);
+	dot<<<B, T>>>(N, dx, dy, dz);
+	int* z;
+	z = (int*)malloc(res_size);
+	cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
+	for (RANGE(i, 0, B)) result += z[i];
+
+	// checking results
+	if (host_expected_result == result) {
+		std::cout << "Success" << std::endl;
+	} else {
+		std::cout << "Error" << std::endl;
+		std::cout << "  expected: " << host_expected_result << std::endl;
+		std::cout << "  got: " << result << std::endl;
+	}
+
+	free(x);
+	free(y);
+
+	return 0;
+}
--- a/gpu/tp3/tp3.pdf
+++ b/gpu/tp3/tp3.pdf
--- a/gpu/tp4/.clang-format
+++ b/gpu/tp4/.clang-format
@ -0,0 +1,13 @@
+# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
+---
+BasedOnStyle: LLVM
+DerivePointerAlignment: false
+IndentWidth: 4
+PointerAlignment: Left
+TabWidth: 4
+UseTab: Always
+AllowShortIfStatementsOnASingleLine: AllIfsAndElse
+AllowShortLoopsOnASingleLine: true
+ColumnLimit: 120
+AllowShortBlocksOnASingleLine: Always
+AllowShortFunctionsOnASingleLine: All
--- a/gpu/tp4/.clangd
+++ b/gpu/tp4/.clangd
@ -0,0 +1,4 @@
+CompileFlags:
+  Add:
+    - -xcuda
+    - --no-cuda-version-check
--- a/gpu/tp4/.gitignore
+++ b/gpu/tp4/.gitignore
@ -0,0 +1,2 @@
+bin/
+/*.zip
--- a/gpu/tp4/README.md
+++ b/gpu/tp4/README.md
@ -0,0 +1 @@
+# 
--- a/gpu/tp4/c/build.sh
+++ b/gpu/tp4/c/build.sh
@ -0,0 +1,26 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+alias log="echo '[build.sh]'"
+
+TARGET="ex1.cu ex2.cu ex3.cu"
+MODULES="conv.cu"
+
+if [ $# -gt 0 ]
+then targets="$@"
+else targets="$TARGET"
+fi
+
+rm -fr bin
+mkdir -p bin
+
+ccargs="-O2"
+#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
+
+for target in $targets
+do
+	sources="$MODULES $target"
+	inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
+	nvcc $ccargs -o bin/${target}.out $modules $inputs
+	./bin/${target}.out
+done
--- a/gpu/tp4/c/src/conv.cu
+++ b/gpu/tp4/c/src/conv.cu
@ -0,0 +1,183 @@
+#include "conv.h"
+
+constexpr int threads_per_bloc = 16;
+constexpr int T = threads_per_bloc;
+
+//
+// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
+//
+#define CUDA_CHECK(code)                                                                                               \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
+}
+
+#define RANGE(X, FROM, TO)                                                                                             \
+	long X = FROM;                                                                                                     \
+	X < TO;                                                                                                            \
+	X += 1
+
+#define THREAD_ID_X() (blockIdx.x * blockDim.x + threadIdx.x);
+
+#define DBG(X, FMT) printf(#X ": %" FMT "\n", X);
+#define DBG_S(X) #X << ": " << X << ", "
+
+//
+// 1D convolution
+// - x: input array of size N
+// - y: kernel of odd size M
+//
+// CPU
+//
+std::vector<int> conv1(const std::vector<int>& x, const std::vector<int>& y) {
+	//
+	// step 01
+	//
+	const int N = x.size();
+	const int M = y.size();
+	const int P = (M - 1) / 2;
+
+	auto z = std::vector<int>(N);
+	for (RANGE(result_index, 0, x.size())) {
+		auto result = 0;
+		for (RANGE(y_index, 0, y.size())) {
+			auto x_index = result_index - P + y_index;
+			if (x_index < 0 || x_index >= x.size()) continue;
+			result += x.at(x_index) * y.at(y_index);
+		}
+		z.at(result_index) = result;
+	}
+
+	return z;
+}
+
+namespace kernel {
+
+//
+// step 02
+//
+__global__ void conv2(const int* dx, const int* dy, int x_length, int y_length, int* dz) {
+	auto thread_id = (long)THREAD_ID_X();
+	if (thread_id >= x_length) return;
+	auto offset = (y_length - 1) / 2;
+
+	auto result_index = thread_id;
+	auto result = 0;
+	for (RANGE(y_index, 0, y_length)) {
+		auto x_index = result_index - offset + y_index;
+		if (x_index < 0 || x_index >= x_length) continue;
+		result += dx[x_index] * dy[y_index];
+	}
+	dz[result_index] = result;
+}
+
+} // namespace kernel
+
+//
+// 1D convolution
+// - x: input array of size N
+// - y: kernel of odd size M
+//
+// GPU (naive)
+//
+std::vector<int> conv2(const std::vector<int>& x, const std::vector<int>& y) {
+	//
+	// step 03
+	//
+	auto dx = (int*)nullptr;
+	auto size_dx = x.size() * sizeof(int);
+	cudaMalloc(&dx, size_dx);
+	cudaMemcpy(dx, x.data(), size_dx, cudaMemcpyHostToDevice);
+
+	auto dy = (int*)nullptr;
+	auto size_dy = y.size() * sizeof(int);
+	cudaMalloc(&dy, size_dy);
+	cudaMemcpy(dy, y.data(), size_dy, cudaMemcpyHostToDevice);
+
+	auto dz = (int*)nullptr;
+	auto size_dz = x.size() * sizeof(int);
+	cudaMalloc(&dz, size_dz);
+
+	auto blocks = x.size() / threads_per_bloc + 1;
+	kernel::conv2<<<blocks, threads_per_bloc>>>(dx, dy, x.size(), y.size(), dz);
+	cudaFree(dx);
+	cudaFree(dy);
+
+	auto z = std::vector<int>(x.size());
+	cudaMemcpy(z.data(), dz, size_dz, cudaMemcpyDeviceToHost);
+	cudaFree(dz);
+
+	return z;
+}
+
+namespace kernel {
+
+//
+// step 04
+//
+__global__ void conv3(const int* dx, const int* dy, int x_length, int y_length, int* dz) {
+	__shared__ int buffer[T];
+	auto thread_id = (long)THREAD_ID_X();
+	if (thread_id >= x_length) return;
+
+	buffer[thread_id % T] = dx[thread_id];
+	__syncthreads();
+	auto buffer_lower_x_index = (thread_id / T) * T;
+	auto buffer_upper_x_index = buffer_lower_x_index + T;
+	auto offset = (y_length - 1) / 2;
+
+	auto result_index = thread_id;
+	auto result = 0;
+	for (RANGE(y_index, 0, y_length)) {
+		auto x_index = result_index - offset + y_index;
+		if (x_index < 0 || x_index >= x_length) continue;
+		auto in_buffer = x_index >= buffer_lower_x_index && x_index < buffer_upper_x_index;
+		if (in_buffer) {
+			auto buff_index = x_index - buffer_lower_x_index;
+			result += buffer[buff_index] * dy[y_index];
+		} else result += dx[x_index] * dy[y_index];
+	}
+	dz[result_index] = result;
+}
+
+} // namespace kernel
+
+//
+// 1D convolution
+// - x: input array of size N
+// - y: kernel of odd size M
+//
+// GPU (optimized)
+//
+std::vector<int> conv3(const std::vector<int>& x, const std::vector<int>& y) {
+	//
+	// step 05
+	//
+	auto dx = (int*)nullptr;
+	auto size_dx = x.size() * sizeof(int);
+	cudaMalloc(&dx, size_dx);
+	cudaMemcpy(dx, x.data(), size_dx, cudaMemcpyHostToDevice);
+
+	auto dy = (int*)nullptr;
+	auto size_dy = y.size() * sizeof(int);
+	cudaMalloc(&dy, size_dy);
+	cudaMemcpy(dy, y.data(), size_dy, cudaMemcpyHostToDevice);
+
+	auto dz = (int*)nullptr;
+	auto size_dz = x.size() * sizeof(int);
+	cudaMalloc(&dz, size_dz);
+
+	auto blocks = x.size() / threads_per_bloc + 1;
+	kernel::conv3<<<blocks, threads_per_bloc>>>(dx, dy, x.size(), y.size(), dz);
+	cudaFree(dx);
+	cudaFree(dy);
+
+	auto z = std::vector<int>(x.size());
+	cudaMemcpy(z.data(), dz, size_dz, cudaMemcpyDeviceToHost);
+	cudaFree(dz);
+
+	return z;
+}
--- a/gpu/tp4/c/src/conv.h
+++ b/gpu/tp4/c/src/conv.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+//
+// 1D convolution 
+// - x: input array of size N
+// - y: kernel of odd size M
+//
+
+// CPU
+std::vector<int> conv1(const std::vector<int>& x, const std::vector<int>& y);
+
+// GPU (naive)
+std::vector<int> conv2(const std::vector<int>& x, const std::vector<int>& y);
+
+// GPU (optimized)
+std::vector<int> conv3(const std::vector<int>& x, const std::vector<int>& y);
--- a/gpu/tp4/c/src/ex1.cu
+++ b/gpu/tp4/c/src/ex1.cu
@ -0,0 +1,66 @@
+#include "conv.h"
+
+void print(const std::vector<int>& vec) {
+	if (vec.empty()) {
+		std::cout << "[]" << std::endl;
+	} else {
+		std::cout << "[";
+		for (size_t i = 0; i < vec.size() - 1; ++i) std::cout << vec[i] << ", ";
+		std::cout << vec.back() << "]" << std::endl;
+	}
+}
+
+int main() {
+	{
+		std::cout << "Test 1" << std::endl;
+		const auto x = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+		const auto y = std::vector{0, 1, 0};					  // M = 3
+		const auto z_sol = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+		const auto z = conv1(x, y);
+		if (z != z_sol) {
+			std::cout << "Error, expected:" << std::endl;
+			print(z_sol);
+			std::cout << "got:" << std::endl;
+			print(z);
+		} else {
+			std::cout << "Ok" << std::endl;
+		}
+	}
+	{
+		std::cout << "Test 2" << std::endl;
+		const auto x = std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+		const auto y = std::vector{1, 2, 4, 2, 1};				  // M = 5
+		const auto z_sol = std::vector{4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
+		const auto z = conv1(x, y);
+		if (z != z_sol) {
+			std::cout << "Error, expected:" << std::endl;
+			print(z_sol);
+			std::cout << "got:" << std::endl;
+			print(z);
+		} else {
+			std::cout << "Ok" << std::endl;
+		}
+	}
+	{
+		std::cout << "Test 3" << std::endl;
+		const auto x = std::vector{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+								   18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
+		const auto y =
+			std::vector{1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
+		const auto z_sol =
+			std::vector{117,	-736,	-1333,	-2058,	-2719,	-3412,	-4089,	-4774,	-5455,	-6138,	-6820,	-7502,
+						-8184,	-8866,	-9548,	-10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686,
+						-16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
+		const auto z = conv1(x, y);
+		if (z != z_sol) {
+			std::cout << "Error, expected:" << std::endl;
+			print(z_sol);
+			std::cout << "got:" << std::endl;
+			print(z);
+		} else {
+			std::cout << "Ok" << std::endl;
+		}
+	}
+
+	return 0;
+}
--- a/gpu/tp4/c/src/ex2.cu
+++ b/gpu/tp4/c/src/ex2.cu
@ -0,0 +1,76 @@
+#include "conv.h"
+
+void print(const std::vector<int>& vec) 
+{
+    if(vec.empty()) 
+    {
+        std::cout << "[]" << std::endl;
+    }
+    else
+    {
+        std::cout << "[";
+        for(size_t i = 0; i < vec.size()-1; ++i)
+            std::cout << vec[i] << ", ";
+        std::cout << vec.back() << "]" << std::endl;
+    }
+}
+
+int main()
+{
+    {
+        std::cout << "Test 1" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+        const std::vector<int> y = {0, 1, 0}; // M = 3
+        const std::vector<int> z_sol = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+        const std::vector<int> z = conv2(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+    {
+        std::cout << "Test 2" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+        const std::vector<int> y = {1, 2, 4, 2, 1}; // M = 5
+        const std::vector<int> z_sol = {4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
+        const std::vector<int> z = conv2(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+    {
+        std::cout << "Test 3" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
+        const std::vector<int> y = {1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
+        const std::vector<int> z_sol = {117, -736, -1333, -2058, -2719, -3412, -4089, -4774, -5455, -6138, -6820, -7502, -8184, -8866, -9548, -10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686, -16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
+        const std::vector<int> z = conv2(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+
+    return 0;
+}
--- a/gpu/tp4/c/src/ex3.cu
+++ b/gpu/tp4/c/src/ex3.cu
@ -0,0 +1,76 @@
+#include "conv.h"
+
+void print(const std::vector<int>& vec) 
+{
+    if(vec.empty()) 
+    {
+        std::cout << "[]" << std::endl;
+    }
+    else
+    {
+        std::cout << "[";
+        for(size_t i = 0; i < vec.size()-1; ++i)
+            std::cout << vec[i] << ", ";
+        std::cout << vec.back() << "]" << std::endl;
+    }
+}
+
+int main()
+{
+    {
+        std::cout << "Test 1" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+        const std::vector<int> y = {0, 1, 0}; // M = 3
+        const std::vector<int> z_sol = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+        const std::vector<int> z = conv3(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+    {
+        std::cout << "Test 2" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // N = 10
+        const std::vector<int> y = {1, 2, 4, 2, 1}; // M = 5
+        const std::vector<int> z_sol = {4, 11, 20, 30, 40, 50, 60, 70, 70, 59};
+        const std::vector<int> z = conv3(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+    {
+        std::cout << "Test 3" << std::endl;
+        const std::vector<int> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; // N = 35
+        const std::vector<int> y = {1, -2, 4, -8, 16, -32, 64, -128, 256, -1024, 256, -128, 64, -32, 16, -8, 4, -2, 1}; // M = 19
+        const std::vector<int> z_sol = {117, -736, -1333, -2058, -2719, -3412, -4089, -4774, -5455, -6138, -6820, -7502, -8184, -8866, -9548, -10230, -10912, -11594, -12276, -12958, -13640, -14322, -15004, -15686, -16368, -17050, -17767, -18380, -19201, -19606, -20843, -20416, -23317, -19562, -29119};
+        const std::vector<int> z = conv3(x, y);
+        if(z != z_sol)
+        {
+            std::cout << "Error, expected:" << std::endl;
+            print(z_sol);
+            std::cout << "got:" << std::endl;
+            print(z);
+        }
+        else
+        {
+            std::cout << "Ok" << std::endl;
+        }
+    }
+
+    return 0;
+}
--- a/gpu/tp5/.clang-format
+++ b/gpu/tp5/.clang-format
@ -0,0 +1,13 @@
+# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
+---
+BasedOnStyle: LLVM
+DerivePointerAlignment: false
+IndentWidth: 4
+PointerAlignment: Left
+TabWidth: 4
+UseTab: Always
+AllowShortIfStatementsOnASingleLine: AllIfsAndElse
+AllowShortLoopsOnASingleLine: true
+ColumnLimit: 120
+AllowShortBlocksOnASingleLine: Always
+AllowShortFunctionsOnASingleLine: All
--- a/gpu/tp5/.clangd
+++ b/gpu/tp5/.clangd
@ -0,0 +1,4 @@
+CompileFlags:
+  Add:
+    - -xcuda
+    - --no-cuda-version-check
--- a/gpu/tp5/.gitignore
+++ b/gpu/tp5/.gitignore
@ -0,0 +1,2 @@
+bin/
+/*.zip
--- a/gpu/tp5/.vscode/settings.json
+++ b/gpu/tp5/.vscode/settings.json
@ -0,0 +1,6 @@
+{
+    "editor.formatOnType": true,
+    "[commonlisp]": {
+        "editor.wordSeparators": "`|;:'\",()"
+    }
+}
--- a/gpu/tp5/c/build.sh
+++ b/gpu/tp5/c/build.sh
@ -0,0 +1,25 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+alias log="echo '[build.sh]'"
+
+TARGET="main.cu"
+MODULES="matrix.cu"
+
+if [ $# -gt 0 ]
+then targets="$@"
+else targets="$TARGET"
+fi
+
+mkdir -p bin
+
+ccargs="-O2"
+#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
+
+for target in $targets
+do
+	sources="$MODULES $target"
+	inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
+	rm -f           bin/${target}.out
+	nvcc $ccargs -o bin/${target}.out $modules $inputs
+done
--- a/gpu/tp5/c/src/main.cu
+++ b/gpu/tp5/c/src/main.cu
--- a/gpu/tp5/c/src/matrix.cu
+++ b/gpu/tp5/c/src/matrix.cu
@ -0,0 +1,198 @@
+#include "matrix.h"
+#include <cstddef>
+#include <cstdlib>
+#include <vector>
+
+#define SEP ;
+#define RANGE(I, FROM, TO) size_t I = FROM SEP I < TO SEP I += 1
+
+//
+// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
+//
+#define CUDA_CHECK(code)                                                                                               \
+	{ cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char* file, int line) {
+	if (code != cudaSuccess) {
+		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+		std::abort();
+	}
+}
+
+//
+// step 01
+// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j)
+//
+__host__ __device__ int index1(int i, int j, int rows, int cols) {
+	if (i < 0 || i >= rows) return -1;
+	if (j < 0 || j >= cols) return -1;
+	return (i * cols) + j;
+}
+
+template <typename T>
+__host__ __device__ inline int get_2d(const T* matrix, size_t x, size_t y, size_t width, size_t height) {
+	auto index = index1(y, x, height, width);
+	return matrix[index];
+}
+
+template <typename T>
+__host__ __device__ inline void set_2d(T* matrix, int item, size_t x, size_t y, size_t width, size_t height) {
+	auto index = index1(y, x, height, width);
+	matrix[index] = item;
+}
+
+//
+// CPU
+//
+std::vector<int> matmul1(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
+	//
+	// step 02
+	//
+
+	auto A_height = N;
+	auto A_width = M;
+	auto B_height = A_width;
+	auto B_width = P;
+
+	auto result = std::vector<int>(N * P);
+	auto result_height = A_height;
+	auto result_width = B_width;
+
+	for (RANGE(x, 0, result_width)) {
+		for (RANGE(y, 0, result_height)) {
+			auto sum = 0;
+			for (RANGE(i, 0, A_width)) {
+				auto item_a = get_2d(A.data(), i, y, A_width, A_height);
+				auto item_b = get_2d(B.data(), x, i, B_width, B_height);
+				sum += (item_a * item_b);
+			}
+			set_2d(result.data(), sum, x, y, result_width, result_height);
+		}
+	}
+
+	return result;
+}
+
+namespace kernel {
+
+#define THREAD_GID(COORD) ((blockDim.COORD * blockIdx.COORD) + threadIdx.COORD)
+
+//
+// step 03
+//
+__global__ void matmul2(const int* A, const int* B, int* C, int N, int M, int P) {
+	auto A_height = N;
+	auto A_width = M;
+	auto B_height = A_width;
+	auto B_width = P;
+
+	auto result = C;
+	auto result_height = A_height;
+	auto result_width = B_width;
+
+	auto x = THREAD_GID(x);
+	auto y = THREAD_GID(y);
+	if (x >= result_width) return;
+	if (y >= result_height) return;
+
+	auto sum = 0;
+	for (RANGE(i, 0, A_width)) {
+		auto item_a = get_2d(A, i, y, A_width, A_height);
+		auto item_b = get_2d(B, x, i, B_width, B_height);
+		sum += (item_a * item_b);
+	}
+	set_2d(result, sum, x, y, result_width, result_height);
+	return;
+}
+} // namespace kernel
+
+template <typename T> inline T* cuda_malloc(size_t item_count = 1) {
+	T* result = nullptr;
+	auto size = item_count * sizeof(T);
+	CUDA_CHECK(cudaMalloc(&result, size));
+	return result;
+}
+
+template <typename T> inline T* cuda_malloc_copy(const T* source, size_t item_count = 1) {
+	auto result = cuda_malloc<T>(item_count);
+	auto size = item_count * sizeof(T);
+	CUDA_CHECK(cudaMemcpy(result, source, size, cudaMemcpyHostToDevice));
+	return result;
+}
+
+template <typename T> inline std::vector<T> cuda_into_host(const T* allocation, size_t item_count = 1) {
+	auto size = item_count * sizeof(T);
+	auto result = std::vector<T>(item_count);
+	CUDA_CHECK(cudaMemcpy(result.data(), allocation, size, cudaMemcpyDeviceToHost));
+	return result;
+}
+
+//
+// GPU
+//
+std::vector<int> matmul2(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
+	//
+	// step 04
+	//
+	auto A_height = N;
+	auto A_width = M;
+	auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
+
+	auto B_height = A_width;
+	auto B_width = P;
+	auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
+
+	auto result_height = A_height;
+	auto result_width = B_width;
+	auto result_dev = cuda_malloc<int>(A_height * B_width);
+
+	auto grid_dim = dim3(result_width / threads_per_bloc + 1, result_height / threads_per_bloc + 1, 1);
+	auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
+	kernel::matmul2<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
+
+	CUDA_CHECK(cudaFree(A_dev));
+	CUDA_CHECK(cudaFree(B_dev));
+	auto result = cuda_into_host(result_dev, result_width * result_height);
+	cudaFree(result_dev);
+
+	return result;
+}
+
+namespace kernel {
+
+//
+// step 05
+// return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
+//
+__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
+	auto local_x = j;
+	auto local_y = i;
+
+	auto local_matrix_width = T;
+	auto base_x = bj * local_matrix_width;
+	auto base_y = bi * local_matrix_width;
+
+	auto x = base_x + local_x;
+	auto y = base_y + local_y;
+	return index1(y, x, rows, cols);
+}
+
+//
+// step 06
+//
+__global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
+	auto step_count = (); //
+}
+
+} // namespace kernel
+
+//
+// GPU by bloc
+//
+std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
+	//
+	// step 07
+	//
+	std::vector<int> C(N * P);
+
+	return C;
+}
--- a/gpu/tp5/c/src/matrix.h
+++ b/gpu/tp5/c/src/matrix.h
@ -0,0 +1,31 @@
+#pragma once
+
+#include <vector>
+#include <iostream>
+
+constexpr int     threads_per_bloc = 16;
+constexpr int T = threads_per_bloc;
+
+//
+// CPU
+//
+std::vector<int> matmul1(
+    const std::vector<int>& A,
+    const std::vector<int>& B,
+    int N, int M, int P);
+
+//
+// GPU
+//
+std::vector<int> matmul2(
+    const std::vector<int>& A,
+    const std::vector<int>& B,
+    int N, int M, int P);
+
+//
+// GPU by bloc
+//
+std::vector<int> matmul3(
+    const std::vector<int>& A,
+    const std::vector<int>& B,
+    int N, int M, int P);
--- a/gpu/tp5/tp5.pdf
+++ b/gpu/tp5/tp5.pdf