diff --git a/gpu/tp2/.gitignore b/gpu/tp2/.gitignore
new file mode 100644
index 0000000..bdf1172
--- /dev/null
+++ b/gpu/tp2/.gitignore
@@ -0,0 +1,2 @@
+bin/
+*.zip
\ No newline at end of file
diff --git a/gpu/tp2/c/build.sh b/gpu/tp2/c/build.sh
new file mode 100755
index 0000000..bf02983
--- /dev/null
+++ b/gpu/tp2/c/build.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+
+TARGET="ex1.cu ex2.cu ex3.cu"
+
+if [ $# -gt 0 ]
+then TARGET="$*"
+fi
+
+rm -fr bin
+mkdir -p bin
+
+for target in $TARGET
+do nvcc src/$target -o bin/${target%.cu}.out
+done
+
+for target in $TARGET
+do ./bin/${target%.cu}.out
+done
diff --git a/gpu/tp2/c/src/Matrix.h b/gpu/tp2/c/src/Matrix.h
new file mode 100644
index 0000000..0756081
--- /dev/null
+++ b/gpu/tp2/c/src/Matrix.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+namespace linalg {
+
+//
+// Generic matrix of type T (int, float, double...)
+//
+template<typename T>
+class Matrix
+{
+public:
+    // construct the matrix and allocate the 2D pitched memory on the device
+    __host__ Matrix(int rows, int cols);
+
+    // free the allocated device memory
+    __host__ void free();
+
+public:
+    // copy values from a host std::vector to the device Matrix
+    // values must be a vector of size rows x cols
+    // allocation is already done in the constructor
+    __host__ void to_cuda(const std::vector<T>& values);
+
+    // copy values from the device Matrix to a host std::vector
+    // values may not be resized yet: resize it to rows x cols if needed
+    __host__ void to_cpu(std::vector<T>& values) const;
+
+public:
+    // accessors at row i and column j
+    __device__ const T& operator()(int i, int j) const;
+    __device__ T& operator()(int i, int j);
+
+public:
+    __host__ Matrix operator + (const Matrix& other) const;
+    __host__ Matrix operator - (const Matrix& other) const;
+    __host__ Matrix operator * (const Matrix& other) const;
+    __host__ Matrix operator / (const Matrix& other) const;
+
+private:
+    // apply a binary functor f on all pairs of elements
+    // f must provide the following operator
+    //
+    //   T operator()(T a, T b)
+    //
+    // template<typename BinaryFunctor>
+    // __host__ Matrix apply(const Matrix& other, BinaryFunctor&& f) const;
+
+public:
+    __host__ __device__ inline int rows() const {return m_rows;}
+    __host__ __device__ inline int cols() const {return m_cols;}
+
+private:
+    T* m_data_ptr; // device pointer
+    int m_rows;
+    int m_cols;
+    size_t m_pitch;
+};
+
+} // namespace linalg
+
+#include "Matrix.hpp"
\ No newline at end of file
diff --git a/gpu/tp2/c/src/Matrix.hpp b/gpu/tp2/c/src/Matrix.hpp
new file mode 100644
index 0000000..30191f2
--- /dev/null
+++ b/gpu/tp2/c/src/Matrix.hpp
@@ -0,0 +1,99 @@
+#include "Matrix.h"
+
+namespace linalg {
+
+namespace kernel {
+
+//
+// step 10
+// CUDA kernel add
+//
+
+//
+// step 12
+// CUDA kernel apply
+//
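+
+// For reference, one possible shape for these two kernels is sketched below
+// (an illustrative sketch, not part of the handout: the names and signatures
+// are assumptions). Each thread handles one (i,j) element through the device
+// accessor operator()(i,j), with a bounds check on both indices.
+//
+// template<typename T>
+// __global__ void add(Matrix<T> a, Matrix<T> b, Matrix<T> c) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < c.rows() && j < c.cols()) c(i,j) = a(i,j) + b(i,j);
+// }
+//
+// template<typename T, typename BinaryFunctor>
+// __global__ void apply(Matrix<T> a, Matrix<T> b, Matrix<T> c, BinaryFunctor f) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < c.rows() && j < c.cols()) c(i,j) = f(a(i,j), b(i,j));
+// }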
+
+} // namespace kernel
+
+template<typename T>
+__host__ Matrix<T>::Matrix(int rows, int cols) :
+    m_data_ptr(nullptr),
+    m_rows(rows),
+    m_cols(cols),
+    m_pitch(0)
+{
+    // step 07
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::free()
+{
+    // step 07
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::to_cuda(const std::vector<T>& values)
+{
+    // step 08
+
+}
+
+template<typename T>
+__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const
+{
+    // step 08
+
+}
+
+template<typename T>
+__device__ const T& Matrix<T>::operator()(int i, int j) const
+{
+    // step 09
+
+}
+
+template<typename T>
+__device__ T& Matrix<T>::operator()(int i, int j)
+{
+    // step 09
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator + (const Matrix<T>& other) const
+{
+    // step 11
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator - (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator * (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+template<typename T>
+__host__ Matrix<T> Matrix<T>::operator / (const Matrix<T>& other) const
+{
+    // step 12
+
+}
+
+} // namespace linalg
diff --git a/gpu/tp2/c/src/ex1.cu b/gpu/tp2/c/src/ex1.cu
new file mode 100644
index 0000000..fbbe22a
--- /dev/null
+++ b/gpu/tp2/c/src/ex1.cu
@@ -0,0 +1,78 @@
+#include <cmath>
+#include <iostream>
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+//
+// step 01
+// return the linear index corresponding to the element at row i and column j
+// in a matrix of size rows x cols, using row-major storage
+//
+__device__ int linear_index(int i, int j, int rows, int cols) {
+
+}
+
+//
+// step 02
+// CUDA kernel add
+//
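+
+// For reference, one possible shape for this kernel is sketched below (an
+// illustrative sketch, not part of the handout: the name and signature are
+// assumptions). With row-major storage, step 01 reduces to
+// `return i * cols + j;`.
+//
+// __global__ void add(const int* dx, int* dy, int rows, int cols) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < rows && j < cols) {
+//         const int idx = linear_index(i, j, rows, cols);
+//         dy[idx] += dx[idx];
+//     }
+// }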
+
+int main()
+{
+    constexpr int rows = 200;
+    constexpr int cols = 80;
+    int* x = (int*)malloc(rows*cols*sizeof(int));
+    int* y = (int*)malloc(rows*cols*sizeof(int));
+    for(int i = 0; i < rows*cols; ++i) {
+        x[i] = i;
+        y[i] = std::pow(-1,i) * i;
+    }
+
+    //
+    // step 03
+    //
+    int* dx;
+    int* dy;
+    // 1. allocate on device
+
+    // 2. copy from host to device
+
+    // 3. launch CUDA kernel
+    // const dim3 threads_per_bloc{32,32,1};
+
+    // 4. copy result from device to host
+
+    // 5. free device memory
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < rows*cols; ++i) {
+        const int expected_result = std::pow(-1,i) * i + i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
diff --git a/gpu/tp2/c/src/ex2.cu b/gpu/tp2/c/src/ex2.cu
new file mode 100644
index 0000000..7c70e49
--- /dev/null
+++ b/gpu/tp2/c/src/ex2.cu
@@ -0,0 +1,80 @@
+#include <cmath>
+#include <iostream>
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+//
+// step 04
+// return a pointer to the value at row i and column j, starting from
+// base_address, where pitch is given in bytes
+//
+__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
+
+}
+
+//
+// step 05
+// CUDA kernel add
+//
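+
+// For reference, one possible shape for this kernel is sketched below (an
+// illustrative sketch, not part of the handout: the name and signature are
+// assumptions). With a pitched allocation, step 04 reduces to
+// `return (int*)((char*)base_address + i * pitch) + j;`.
+//
+// __global__ void add(int* dx, int* dy, int rows, int cols, size_t pitch) {
+//     const int i = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int j = blockIdx.x * blockDim.x + threadIdx.x;
+//     if(i < rows && j < cols) {
+//         *get_ptr(dy, i, j, pitch) += *get_ptr(dx, i, j, pitch);
+//     }
+// }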
+
+int main()
+{
+    constexpr int rows = 200;
+    constexpr int cols = 80;
+    int* x = (int*)malloc(rows*cols*sizeof(int));
+    int* y = (int*)malloc(rows*cols*sizeof(int));
+    for(int i = 0; i < rows*cols; ++i) {
+        x[i] = i;
+        y[i] = std::pow(-1,i) * i;
+    }
+
+    //
+    // step 06
+    //
+    int* dx;
+    int* dy;
+    size_t pitch;
+    // 1. allocate on device
+
+    // 2. copy from host to device
+
+    // 3. launch CUDA kernel
+    // const dim3 threads_per_bloc{32,32,1};
+
+    // 4. copy result from device to host
+
+    // 5. free device memory
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < rows*cols; ++i) {
+        const int expected_result = std::pow(-1,i) * i + i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
diff --git a/gpu/tp2/c/src/ex3.cu b/gpu/tp2/c/src/ex3.cu
new file mode 100644
index 0000000..30089b0
--- /dev/null
+++ b/gpu/tp2/c/src/ex3.cu
@@ -0,0 +1,148 @@
+#include "Matrix.h"
+
+int main()
+{
+    {
+        const int rows = 4;
+        const int cols = 4;
+        // instantiate two matrices of integers on the device
+        linalg::Matrix<int> A(rows, cols);
+        linalg::Matrix<int> B(rows, cols);
+        // fill the two matrices
+        A.to_cuda({ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16});
+        B.to_cuda({16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
+
+        // compute the sum
+        auto C = A + B;
+
+        // transfer the result to the host
+        std::vector<int> c_res;
+        C.to_cpu(c_res);
+        C.free();
+
+        // check results
+        const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
+        if(c_res != c_expected) {
+            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
+            std::cout << " expected: ";
+            for(int i : c_expected) std::cout << i << " ";
+            std::cout << std::endl;
+            std::cout << " got: ";
+            for(int i : c_res) std::cout << i << " ";
+            std::cout << std::endl;
+        } else {
+            std::cout << "Success" << std::endl;
+        }
+
+        // compute the difference
+        auto D = A - B;
+
+        // transfer the result to the host
+        std::vector<int> d_res;
+        D.to_cpu(d_res);
+        D.free();
+
+        // check results
+        const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15};
+        if(d_res != d_expected) {
+            std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
+            std::cout << " expected: ";
+            for(int i : d_expected) std::cout << i << " ";
+            std::cout << std::endl;
+            std::cout << " got: ";
+            for(int i : d_res) std::cout << i << " ";
+            std::cout << std::endl;
+        } else {
+            std::cout << "Success" << std::endl;
+        }
+
+        // release the input matrices
+        A.free();
+        B.free();
+    }
+    // ------------------------------------------------------------------------
+    {
+        const int rows = 89;
+        const int cols = 128;
+        linalg::Matrix<float> A(rows, cols);
+        linalg::Matrix<float> B(rows, cols);
+        std::vector<float> a_values(rows*cols);
+        std::vector<float> b_values(rows*cols);
+        for(int i = 0; i < rows*cols; ++i) {
+            a_values[i] = 1 + float(i) / 100;
+            b_values[i] = std::pow(-1, i) * float(i)/(rows*cols) * 100;
+        }
+        A.to_cuda(a_values);
+        B.to_cuda(b_values);
+
+        auto C = A + B;
+        auto D = A - B;
+        auto E = A * B;
+        auto F = A / B;
+
+        std::vector<float> c_values;
+        C.to_cpu(c_values);
+        std::vector<float> d_values;
+        D.to_cpu(d_values);
+        std::vector<float> e_values;
+        E.to_cpu(e_values);
+        std::vector<float> f_values;
+        F.to_cpu(f_values);
+
+        C.free();
+        D.free();
+        E.free();
+        F.free();
+        A.free();
+        B.free();
+
+        const float epsilon = 0.001f;
+        bool ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( c_values[i] - (a_values[i] + b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
+                std::cout << " expected: " << a_values[i] + b_values[i] << std::endl;
+                std::cout << " got: " << c_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( d_values[i] - (a_values[i] - b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
+                std::cout << " expected: " << a_values[i] - b_values[i] << std::endl;
+                std::cout << " got: " << d_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( e_values[i] - (a_values[i] * b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (*):" << std::endl;
+                std::cout << " expected: " << a_values[i] * b_values[i] << std::endl;
+                std::cout << " got: " << e_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+
+        ok = true;
+        for(int i = 0; i < rows*cols; ++i) {
+            const float diff = std::abs( f_values[i] - (a_values[i] / b_values[i]) );
+            if(diff > epsilon) {
+                std::cout << __FILE__ << ":" << __LINE__ << ": Failure (/):" << std::endl;
+                std::cout << " expected: " << a_values[i] / b_values[i] << std::endl;
+                std::cout << " got: " << f_values[i] << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if(ok) std::cout << "Success" << std::endl;
+    }
+
+    return 0;
+}
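For reference, a possible host-side sequence for step 06 of ex2.cu is sketched
below (step 03 of ex1.cu follows the same pattern with cudaMalloc/cudaMemcpy
and a linear layout). This is a sketch under the assumption that the kernel is
named `add` and takes the pitch in bytes, as in the commented sketch above;
the name `blocs` is also an assumption, not the official solution.

    // 1. allocate 2D pitched memory on the device (pitch is returned in bytes)
    CUDA_CHECK( cudaMallocPitch(&dx, &pitch, cols*sizeof(int), rows) );
    CUDA_CHECK( cudaMallocPitch(&dy, &pitch, cols*sizeof(int), rows) );

    // 2. copy from host to device (host rows are densely packed)
    CUDA_CHECK( cudaMemcpy2D(dx, pitch, x, cols*sizeof(int), cols*sizeof(int), rows, cudaMemcpyHostToDevice) );
    CUDA_CHECK( cudaMemcpy2D(dy, pitch, y, cols*sizeof(int), cols*sizeof(int), rows, cudaMemcpyHostToDevice) );

    // 3. launch the CUDA kernel with one thread per element
    const dim3 threads_per_bloc{32,32,1};
    const dim3 blocs{(cols + threads_per_bloc.x - 1) / threads_per_bloc.x,
                     (rows + threads_per_bloc.y - 1) / threads_per_bloc.y, 1};
    add<<<blocs, threads_per_bloc>>>(dx, dy, rows, cols, pitch);

    // 4. copy the result from device to host
    CUDA_CHECK( cudaMemcpy2D(y, cols*sizeof(int), dy, pitch, cols*sizeof(int), rows, cudaMemcpyDeviceToHost) );

    // 5. free device memory
    CUDA_CHECK( cudaFree(dx) );
    CUDA_CHECK( cudaFree(dy) );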