diff --git a/gpu/tp2/.gitignore b/gpu/tp2/.gitignore
new file mode 100644
index 0000000..bdf1172
--- /dev/null
+++ b/gpu/tp2/.gitignore
@@ -0,0 +1,2 @@
+bin/
+*.zip
\ No newline at end of file
diff --git a/gpu/tp2/c/build.sh b/gpu/tp2/c/build.sh
new file mode 100755
index 0000000..bf02983
--- /dev/null
+++ b/gpu/tp2/c/build.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+
+TARGET="ex1.cu ex2.cu ex3.cu"
+
+if [ $# -gt 0 ]
+then TARGET="$*"
+fi
+
+rm -fr bin
+mkdir -p bin
+
+for target in $TARGET
+do nvcc src/$target -o bin/${target%.cu}.out
+done
+
+for target in $TARGET
+do ./bin/${target%.cu}.out
+done
diff --git a/gpu/tp2/c/src/Matrix.h b/gpu/tp2/c/src/Matrix.h
new file mode 100644
index 0000000..0756081
--- /dev/null
+++ b/gpu/tp2/c/src/Matrix.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+namespace linalg {
+
+//
+// Generic matrix of type T (int, float, double...)
+//
+template<typename T>
+class Matrix
+{
+public:
+    // construct the matrix and allocate the 2D pitched memory on the device
+    __host__ Matrix(int rows, int cols);
+
+    // free the allocated device memory
+    __host__ void free();
+
+public:
+    // copy values from a host std::vector to the device Matrix
+    // values must be a vector of size rows x cols
+    // allocation is already done in the constructor
+    __host__ void to_cuda(const std::vector<T>& values);
+
+    // copy values from the device Matrix to a host std::vector
+    // values may not be resized yet: resize it to rows x cols if needed
+    __host__ void to_cpu(std::vector<T>& values) const;
+
+public:
+    // accessors at row i and column j
+    __device__ const T& operator()(int i, int j) const;
+    __device__ T& operator()(int i, int j);
+
+public:
+    __host__ Matrix operator + (const Matrix& other) const;
+    __host__ Matrix operator - (const Matrix& other) const;
+    __host__ Matrix operator * (const Matrix& other) const;
+    __host__ Matrix operator / (const Matrix& other) const;
+
+private:
+    // apply a binary functor f on all pairs of elements
+    // f must provide the following operator
+    //
+    //   T operator()(T a, T b)
+    //
+    // template<typename BinaryFunctor>
+    // __host__ Matrix apply(const Matrix& other, BinaryFunctor&& f) const;
+
+public:
+    __host__ __device__ inline int rows() const {return m_rows;}
+    __host__ __device__ inline int cols() const {return m_cols;}
+
+private:
+    T* m_data_ptr; // device pointer
+    int m_rows;
+    int m_cols;
+    size_t m_pitch;
+};
+
+} // namespace linalg
+
+#include "Matrix.hpp"
\ No newline at end of file
diff --git a/gpu/tp2/c/src/Matrix.hpp b/gpu/tp2/c/src/Matrix.hpp
new file mode 100644
index 0000000..30191f2
--- /dev/null
+++ b/gpu/tp2/c/src/Matrix.hpp
@@ -0,0 +1,99 @@
+#include "Matrix.h"
+
+namespace linalg {
+
+namespace kernel {
+
+//
+// step 10
+// CUDA kernel add
+//
+
+//
+// step 12
+// CUDA kernel apply
+//
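+
+// For reference, one possible shape for these two kernels is sketched below
+// (an illustrative sketch, not part of the handout: the names and signatures
+// are assumptions). Each thread handles one (i,j) element through the device
+// accessor operator()(i,j), with a bounds check on both indices.
+//
+// template<typename T>
+// __global__ void add(Matrix<T> a, Matrix<T> b, Matrix<T> c) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < c.rows() && j < c.cols()) c(i,j) = a(i,j) + b(i,j);
+// }
+//
+// template<typename T, typename BinaryFunctor>
+// __global__ void apply(Matrix<T> a, Matrix<T> b, Matrix<T> c, BinaryFunctor f) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < c.rows() && j < c.cols()) c(i,j) = f(a(i,j), b(i,j));
+// }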
+
+} // namespace kernel
+
+template<typename T>
+__host__ Matrix<T>::Matrix(int rows, int cols) :
+    m_data_ptr(nullptr),
+    m_rows(rows),
+    m_cols(cols),
+    m_pitch(0)
+{
+    // step 07
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::free()
+{
+    // step 07
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::to_cuda(const std::vector<T>& values)
+{
+    // step 08
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const
+{
+    // step 08
+
+}
+
+template<typename T>
+__device__ const T& Matrix<T>::operator()(int i, int j) const
+{
+    // step 09
+
+}
+
+template<typename T>
+__device__ T& Matrix<T>::operator()(int i, int j)
+{
+    // step 09
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator + (const Matrix<T>& other) const
+{
+    // step 11
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator - (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator * (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator / (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+} // namespace linalg
diff --git a/gpu/tp2/c/src/ex1.cu b/gpu/tp2/c/src/ex1.cu
new file mode 100644
index 0000000..fbbe22a
--- /dev/null
+++ b/gpu/tp2/c/src/ex1.cu
@@ -0,0 +1,78 @@
+#include <cmath>
+#include <iostream>
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+//
+// step 01
+// return the linear index corresponding to the element at row i and column j
+// in a matrix of size rows x cols, using row-major storage
+//
+__device__ int linear_index(int i, int j, int rows, int cols) {
+
+}
+
+//
+// step 02
+// CUDA kernel add
+//
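+
+// For reference, one possible shape for this kernel is sketched below (an
+// illustrative sketch, not part of the handout: the name and signature are
+// assumptions). With row-major storage, step 01 reduces to
+// `return i * cols + j;`.
+//
+// __global__ void add(const int* dx, int* dy, int rows, int cols) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < rows && j < cols) {
+//         const int idx = linear_index(i, j, rows, cols);
+//         dy[idx] += dx[idx];
+//     }
+// }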
+
+int main()
+{
+    constexpr int rows = 200;
+    constexpr int cols = 80;
+    int* x = (int*)malloc(rows*cols*sizeof(int));
+    int* y = (int*)malloc(rows*cols*sizeof(int));
+    for(int i = 0; i < rows*cols; ++i) {
+        x[i] = i;
+        y[i] = std::pow(-1,i) * i;
+    }
+
+    //
+    // step 03
+    //
+    int* dx;
+    int* dy;
+    // 1. allocate on device
+
+    // 2. copy from host to device
+
+    // 3. launch CUDA kernel
+    // const dim3 threads_per_bloc{32,32,1};
+
+    // 4. copy result from device to host
+
+    // 5. free device memory
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < rows*cols; ++i) {
+        const int expected_result = std::pow(-1,i) * i + i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
diff --git a/gpu/tp2/c/src/ex2.cu b/gpu/tp2/c/src/ex2.cu
new file mode 100644
index 0000000..7c70e49
--- /dev/null
+++ b/gpu/tp2/c/src/ex2.cu
@@ -0,0 +1,80 @@
+#include <cmath>
+#include <iostream>
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+//
+// step 04
+// return a pointer to the value at row i and column j, starting from
+// base_address, where pitch is given in bytes
+//
+__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
+
+}
+
+//
+// step 05
+// CUDA kernel add
+//
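+
+// For reference, one possible shape for this kernel is sketched below (an
+// illustrative sketch, not part of the handout: the name and signature are
+// assumptions). With a pitched allocation, step 04 reduces to
+// `return (int*)((char*)base_address + i * pitch) + j;`.
+//
+// __global__ void add(int* dx, int* dy, int rows, int cols, size_t pitch) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < rows && j < cols) {
+//         *get_ptr(dy, i, j, pitch) += *get_ptr(dx, i, j, pitch);
+//     }
+// }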
+
+int main()
+{
+    constexpr int rows = 200;
+    constexpr int cols = 80;
+    int* x = (int*)malloc(rows*cols*sizeof(int));
+    int* y = (int*)malloc(rows*cols*sizeof(int));
+    for(int i = 0; i < rows*cols; ++i) {
+        x[i] = i;
+        y[i] = std::pow(-1,i) * i;
+    }
+
+    //
+    // step 06
+    //
+    int* dx;
+    int* dy;
+    size_t pitch;
+    // 1. allocate on device
+
+    // 2. copy from host to device
+
+    // 3. launch CUDA kernel
+    // const dim3 threads_per_bloc{32,32,1};
+
+    // 4. copy result from device to host
+
+    // 5. free device memory
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < rows*cols; ++i) {
+        const int expected_result = std::pow(-1,i) * i + i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
diff --git a/gpu/tp2/c/src/ex3.cu b/gpu/tp2/c/src/ex3.cu
new file mode 100644
index 0000000..30089b0
--- /dev/null
+++ b/gpu/tp2/c/src/ex3.cu
@@ -0,0 +1,148 @@
+#include "Matrix.h"
+
+int main()
+{
+    {
+        const int rows = 4;
+        const int cols = 4;
+        // instantiate two matrices of integers on the device
+        linalg::Matrix<int> A(rows, cols);
+        linalg::Matrix<int> B(rows, cols);
+        // fill the two matrices
+        A.to_cuda({ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16});
+        B.to_cuda({16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
+
+        // compute the sum
+        auto C = A + B;
+
+        // transfer the result to the host
+        std::vector<int> c_res;
+        C.to_cpu(c_res);
+        C.free();
+
+        // check results
+        const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
+        if(c_res != c_expected) {
+            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
+            std::cout << " expected: ";
+            for(int i : c_expected) std::cout << i << " ";
+            std::cout << std::endl;
+            std::cout << " got: ";
+            for(int i : c_res) std::cout << i << " ";
+            std::cout << std::endl;
+        } else {
+            std::cout << "Success" << std::endl;
+        }
+
+        // compute the difference
+        auto D = A - B;
+
+        // transfer the result to the host
+        std::vector<int> d_res;
+        D.to_cpu(d_res);
+        D.free();
+
+        // check results
+        const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15};
+        if(d_res != d_expected) {
+            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
+            std::cout << " expected: ";
+            for(int i : d_expected) std::cout << i << " ";
+            std::cout << std::endl;
+            std::cout << " got: ";
+            for(int i : d_res) std::cout << i << " ";
+            std::cout << std::endl;
+        } else {
+            std::cout << "Success" << std::endl;
+        }
+
+        // release the input matrices
+        A.free();
+        B.free();
+    }
+    // ------------------------------------------------------------------------
+    {
+        const int rows = 89;
+        const int cols = 128;
+        linalg::Matrix<float> A(rows, cols);
+        linalg::Matrix<float> B(rows, cols);
+        std::vector<float> a_values(rows*cols);
+        std::vector<float> b_values(rows*cols);
+        for(int i = 0; i < rows*cols; ++i) {
+            a_values[i] = 1 + float(i) / 100;
+            b_values[i] = std::pow(-1, i) * float(i)/(rows*cols) * 100;
+        }
+        A.to_cuda(a_values);
+        B.to_cuda(b_values);
+
+        auto C = A + B;
+        auto D = A - B;
+        auto E = A * B;
+        auto F = A / B;
+
+        std::vector<float> c_values;
+        C.to_cpu(c_values);
+        std::vector<float> d_values;
+        D.to_cpu(d_values);
+        std::vector<float> e_values;
+        E.to_cpu(e_values);
+        std::vector<float> f_values;
+        F.to_cpu(f_values);
+
+        C.free();
+        D.free();
+        E.free();
+        F.free();
+        A.free();
+        B.free();
+
+        const float epsilon = 0.001f;
+        bool ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( c_values[i] - (a_values[i] + b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
+                std::cout << " expected: " << a_values[i] + b_values[i] << std::endl;
+                std::cout << " got: " << c_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( d_values[i] - (a_values[i] - b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
+                std::cout << " expected: " << a_values[i] - b_values[i] << std::endl;
+                std::cout << " got: " << d_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( e_values[i] - (a_values[i] * b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (*):" << std::endl;
+                std::cout << " expected: " << a_values[i] * b_values[i] << std::endl;
+                std::cout << " got: " << e_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( f_values[i] - (a_values[i] / b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (/):" << std::endl;
+                std::cout << " expected: " << a_values[i] / b_values[i] << std::endl;
+                std::cout << " got: " << f_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+    }
+
+    return 0;
+}
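For reference, a possible host-side sequence for step 06 of ex2.cu is sketched
below (step 03 of ex1.cu follows the same pattern with cudaMalloc/cudaMemcpy
and a linear layout). This is a sketch under the assumption that the kernel is
named `add` and takes the pitch in bytes, as in the commented sketch above;
the name `blocs` is also an assumption, not the official solution.

    // 1. allocate 2D pitched memory on the device (pitch is returned in bytes)
    CUDA_CHECK( cudaMallocPitch(&dx, &pitch, cols*sizeof(int), rows) );
    CUDA_CHECK( cudaMallocPitch(&dy, &pitch, cols*sizeof(int), rows) );

    // 2. copy from host to device (host rows are densely packed)
    CUDA_CHECK( cudaMemcpy2D(dx, pitch, x, cols*sizeof(int), cols*sizeof(int), rows, cudaMemcpyHostToDevice) );
    CUDA_CHECK( cudaMemcpy2D(dy, pitch, y, cols*sizeof(int), cols*sizeof(int), rows, cudaMemcpyHostToDevice) );

    // 3. launch the CUDA kernel with one thread per element
    const dim3 threads_per_bloc{32,32,1};
    const dim3 blocs{(cols + threads_per_bloc.x - 1) / threads_per_bloc.x,
                     (rows + threads_per_bloc.y - 1) / threads_per_bloc.y, 1};
    add<<<blocs, threads_per_bloc>>>(dx, dy, rows, cols, pitch);

    // 4. copy the result from device to host
    CUDA_CHECK( cudaMemcpy2D(y, cols*sizeof(int), dy, pitch, cols*sizeof(int), rows, cudaMemcpyDeviceToHost) );

    // 5. free device memory
    CUDA_CHECK( cudaFree(dx) );
    CUDA_CHECK( cudaFree(dy) );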