gpu
This commit is contained in:
parent
8b3bb9c382
commit
d976cfaf74
37 changed files with 2669 additions and 371 deletions
|
@ -1,20 +1,25 @@
|
|||
#!/bin/sh
|
||||
cd "$(dirname "$(realpath "$0")")"
|
||||
set -e
|
||||
alias log="echo '[build.sh]'"
|
||||
|
||||
TARGET="ex1.cu ex2.cu ex3.cu ex4.cu"
|
||||
# Default list of examples to build and run. The build loop iterates over
# $targets, so the default must be stored there as well; otherwise running the
# script without arguments builds nothing.
TARGET="ex1 ex2 ex3"
targets="$TARGET"
|
||||
|
||||
if [ $# -gt 0 ]
|
||||
then TARGET=$1
|
||||
then targets=$@
|
||||
fi
|
||||
|
||||
|
||||
rm -fr bin
|
||||
mkdir -p bin
|
||||
|
||||
for target in $TARGET
|
||||
do nvcc src/$target -o bin/${target%.cu}.out
|
||||
done
|
||||
ccargs=""
|
||||
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
|
||||
|
||||
for target in $TARGET
|
||||
do ./bin/${target%.cu}.out
|
||||
|
||||
for target in $targets
|
||||
do
|
||||
echo ""
|
||||
nvcc $ccargs -o bin/${target}.out src/${target}.cu
|
||||
./bin/${target}.out
|
||||
done
|
||||
|
|
|
@ -1,14 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
|
||||
inline void cuda_check(cudaError_t code, const char *file, int line) {
|
||||
if(code != cudaSuccess) {
|
||||
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
#define CUDA_CHECK(code) \
|
||||
{ cuda_check((code), __FILE__, __LINE__); }
|
||||
// Report a failed CUDA runtime call and terminate the program.
//   code       status returned by the CUDA call
//   file, line call site, printed in front of the error description
// Does nothing when code is cudaSuccess.
inline void cuda_check(cudaError_t code, const char* file, int line) {
  if (code == cudaSuccess)
    return;
  std::cout << file << ':' << line << ": [CUDA ERROR] "
            << cudaGetErrorString(code) << std::endl;
  std::abort();
}
|
||||
|
||||
namespace linalg {
|
||||
|
@ -16,55 +18,53 @@ namespace linalg {
|
|||
//
|
||||
// Generic matrix of type T (int, float, double...)
|
||||
//
|
||||
template<typename T>
|
||||
class Matrix
|
||||
{
|
||||
public:
|
||||
// construct matrix, allocate the 2D pitched memory on the device
|
||||
__host__ Matrix(int rows, int cols);
|
||||
template <typename T> class Matrix {
|
||||
public:
|
||||
// construct matrix, allocate the 2D pitched memory on the device
|
||||
__host__ Matrix(int rows, int cols);
|
||||
|
||||
// free allocated device memory
|
||||
__host__ void free();
|
||||
// free allocated device memory
|
||||
__host__ void free();
|
||||
|
||||
public:
|
||||
// copy values from host std::vector to device Matrix
|
||||
// values must be a vector of size rows x cols
|
||||
// allocation is already done in the constructor
|
||||
__host__ void to_cuda(const std::vector<T>& values);
|
||||
public:
|
||||
// copy values from host std::vector to device Matrix
|
||||
// values must be a vector of size rows x cols
|
||||
// allocation is already done in the constructor
|
||||
__host__ void to_cuda(const std::vector<T>& values);
|
||||
|
||||
// copy values from device Matrix to host std::vector
|
||||
// values may not be resized
|
||||
__host__ void to_cpu(std::vector<T>& values) const;
|
||||
// copy values from device Matrix to host std::vector
|
||||
// values may not be resized
|
||||
__host__ void to_cpu(std::vector<T>& values) const;
|
||||
|
||||
public:
|
||||
// accessor at row i and column j
|
||||
__device__ const T& operator()(int i, int j) const;
|
||||
__device__ T& operator()(int i, int j);
|
||||
public:
|
||||
// accessor at row i and column j
|
||||
__device__ const T& operator()(int i, int j) const;
|
||||
__device__ T& operator()(int i, int j);
|
||||
|
||||
public:
|
||||
__host__ Matrix operator + (const Matrix<T>& other) const;
|
||||
__host__ Matrix operator - (const Matrix<T>& other) const;
|
||||
__host__ Matrix operator * (const Matrix<T>& other) const;
|
||||
__host__ Matrix operator / (const Matrix<T>& other) const;
|
||||
public:
|
||||
__host__ Matrix operator+(const Matrix<T>& other) const;
|
||||
__host__ Matrix operator-(const Matrix<T>& other) const;
|
||||
__host__ Matrix operator*(const Matrix<T>& other) const;
|
||||
__host__ Matrix operator/(const Matrix<T>& other) const;
|
||||
|
||||
private:
|
||||
// apply binary functor f on all pairs of elements
|
||||
// f must provide the following operator
|
||||
//
|
||||
// T operator()(T a, T b)
|
||||
//
|
||||
// template<typename BinaryFunctor>
|
||||
// __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;
|
||||
private:
|
||||
// apply binary functor f on all pairs of elements
|
||||
// f must provide the following operator
|
||||
//
|
||||
// T operator()(T a, T b)
|
||||
//
|
||||
// template<typename BinaryFunctor>
|
||||
// __host__ Matrix apply(const Matrix<T>& other, BinaryFunctor&& f) const;
|
||||
|
||||
public:
|
||||
__host__ __device__ inline int rows() const {return m_rows;}
|
||||
__host__ __device__ inline int cols() const {return m_cols;}
|
||||
public:
|
||||
__host__ __device__ inline int rows() const { return m_rows; }
|
||||
__host__ __device__ inline int cols() const { return m_cols; }
|
||||
|
||||
private:
|
||||
T* m_data_ptr; // device pointer
|
||||
int m_rows;
|
||||
int m_cols;
|
||||
size_t m_pitch;
|
||||
private:
|
||||
T* m_data_ptr; // device pointer
|
||||
int m_rows;
|
||||
int m_cols;
|
||||
size_t m_pitch;
|
||||
};
|
||||
|
||||
} // namespace linalg
|
||||
|
|
|
@ -1,5 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
#include "Matrix.h"
|
||||
|
||||
// Expands to the three clauses of a for-loop header that counts `i` from
// `from` (inclusive) up to `to` (exclusive) in steps of one:
//
//   for (RANGE(k, 0, n)) { ... }
//
// The bounds are parenthesised so that expression arguments (for example a
// conditional expression) keep their meaning after expansion.
#define RANGE(i, from, to) \
  int i = (from);          \
  i < (to);                \
  i += 1
|
||||
|
||||
namespace linalg {
|
||||
|
||||
namespace kernel {
|
||||
|
@ -8,92 +15,111 @@ namespace kernel {
|
|||
// step 10
|
||||
// CUDA kernel add
|
||||
//
|
||||
|
||||
|
||||
//
// Element-wise sum kernel: res(x, y) = a(x, y) + b(x, y).
//
// Launch layout: 2D grid of 2D blocks. The x index of a thread is compared
// against res->cols() and the y index against res->rows(); threads outside
// that range return without touching memory.
//
// a, b and res are dereferenced inside the kernel, so they must point to
// Matrix objects stored in memory the device can read.
//
// The function is __global__ because it is launched with the <<<...>>>
// syntax; a __device__ function cannot be launched from the host.
//
template <typename T>
__global__ void add(const Matrix<T>* a, const Matrix<T>* b, Matrix<T>* res) {
  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= res->cols() || y >= res->rows())
    return;
  // dereference the pointers before calling the accessor; casting a pointer
  // to a reference would reinterpret the pointer itself as a Matrix
  (*res)(x, y) = (*a)(x, y) + (*b)(x, y);
}
|
||||
|
||||
//
|
||||
// step 12
|
||||
// CUDA kernel apply
|
||||
//
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace kernel
|
||||
|
||||
|
||||
template<typename T>
|
||||
__host__ Matrix<T>::Matrix(int rows, int cols) :
|
||||
m_data_ptr(nullptr),
|
||||
m_rows(rows),
|
||||
m_cols(cols),
|
||||
m_pitch(0)
|
||||
{
|
||||
// step 07
|
||||
|
||||
// Construct a rows x cols matrix and allocate its storage with
// cudaMallocPitch. Each row is cols * sizeof(T) bytes wide; the row stride
// chosen by the runtime is stored in m_pitch (in bytes).
// Aborts through CUDA_CHECK if the allocation fails.
template <typename T>
__host__ Matrix<T>::Matrix(int rows, int cols)
    : m_data_ptr(nullptr), m_rows(rows), m_cols(cols), m_pitch(0) {
  // step 07
  const size_t row_bytes = cols * sizeof(T);
  CUDA_CHECK(cudaMallocPitch(&m_data_ptr, &m_pitch, row_bytes, rows));
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void Matrix<T>::free()
|
||||
{
|
||||
// step 07
|
||||
|
||||
// Release the device allocation owned by this matrix.
// The pointer is reset afterwards so that calling free() again on the same
// object passes a null pointer to cudaFree instead of a stale address.
// Aborts through CUDA_CHECK if cudaFree reports an error.
template <typename T> __host__ void Matrix<T>::free() {
  // step 07
  CUDA_CHECK(cudaFree(m_data_ptr));
  m_data_ptr = nullptr;
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void Matrix<T>::to_cuda(const std::vector<T>& values)
|
||||
{
|
||||
// step 08
|
||||
|
||||
// Copy rows x cols values from a host vector into the device allocation.
// The vector is read as m_rows tightly packed rows of m_cols elements; the
// device side uses the stride stored in m_pitch.
// Aborts if the vector holds fewer than rows x cols elements (the copy would
// read past its end) or if the CUDA copy fails.
template <typename T>
__host__ void Matrix<T>::to_cuda(const std::vector<T>& values) {
  // step 08
  const size_t needed = static_cast<size_t>(m_rows) * m_cols;
  if (values.size() < needed) {
    std::cout << __FILE__ << ':' << __LINE__ << ": [Matrix::to_cuda] expected "
              << needed << " values, got " << values.size() << std::endl;
    std::abort();
  }
  if (needed == 0)
    return;  // nothing to transfer
  const size_t row_bytes = m_cols * sizeof(T);
  CUDA_CHECK(cudaMemcpy2D(m_data_ptr, m_pitch, values.data(), row_bytes,
                          row_bytes, m_rows, cudaMemcpyHostToDevice));
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const
|
||||
{
|
||||
// step 08
|
||||
|
||||
// Copy the rows x cols device values into a host vector, stored as m_rows
// tightly packed rows of m_cols elements.
// The vector is grown to rows x cols elements when it is smaller (callers may
// pass an empty vector); a larger vector keeps its size and its tail.
// Aborts through CUDA_CHECK if the copy fails.
template <typename T>
__host__ void Matrix<T>::to_cpu(std::vector<T>& values) const {
  // step 08
  const size_t needed = static_cast<size_t>(m_rows) * m_cols;
  if (values.size() < needed)
    values.resize(needed);
  if (needed == 0)
    return;  // nothing to transfer
  const size_t row_bytes = m_cols * sizeof(T);
  CUDA_CHECK(cudaMemcpy2D(values.data(), row_bytes, m_data_ptr, m_pitch,
                          row_bytes, m_rows, cudaMemcpyDeviceToHost));
}
|
||||
|
||||
template<typename T>
|
||||
__device__ const T& Matrix<T>::operator()(int i, int j) const
|
||||
{
|
||||
// step 09
|
||||
|
||||
// Read-only element access in device code.
// The element is located at byte offset j * m_pitch + i * sizeof(T) from the
// start of the allocation, i.e. j selects the pitched line and i the position
// inside that line.
// NOTE(review): the class declaration describes this accessor as "row i and
// column j", while the addressing here uses i inside a line and j across
// lines - confirm which convention callers rely on.
// Precondition: 0 <= i < m_cols and 0 <= j < m_rows. A reference cannot be
// null, so out-of-range indices are not reported; they are undefined
// behaviour.
template <typename T>
__device__ const T& Matrix<T>::operator()(int i, int j) const {
  // step 09
  const char* line = reinterpret_cast<const char*>(m_data_ptr) + j * m_pitch;
  return reinterpret_cast<const T*>(line)[i];
}
|
||||
|
||||
template<typename T>
|
||||
__device__ T& Matrix<T>::operator()(int i, int j)
|
||||
{
|
||||
// step 09
|
||||
|
||||
// Writable element access in device code.
// The element is located at byte offset j * m_pitch + i * sizeof(T) from the
// start of the allocation, i.e. j selects the pitched line and i the position
// inside that line.
// NOTE(review): the class declaration describes this accessor as "row i and
// column j", while the addressing here uses i inside a line and j across
// lines - confirm which convention callers rely on.
// Precondition: 0 <= i < m_cols and 0 <= j < m_rows; indices are not checked.
template <typename T> __device__ T& Matrix<T>::operator()(int i, int j) {
  // step 09
  char* line = reinterpret_cast<char*>(m_data_ptr) + j * m_pitch;
  return reinterpret_cast<T*>(line)[i];
}
|
||||
|
||||
template<typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator + (const Matrix<T>& other) const
|
||||
{
|
||||
// step 11
|
||||
|
||||
// Element-wise sum of two matrices, computed on the device.
// The result covers the overlapping region: min(rows) x min(cols).
// The returned matrix owns a new device allocation; release it with free().
//
// kernel::add receives pointers to Matrix objects and dereferences them on
// the device, so the three handles (this, other, result) are first copied
// into a small device buffer. Passing the host addresses directly would make
// the kernel read host memory.
template <typename T>
__host__ Matrix<T> Matrix<T>::operator+(const Matrix<T>& other) const {
  // step 11
  const int width = m_cols < other.m_cols ? m_cols : other.m_cols;
  const int height = m_rows < other.m_rows ? m_rows : other.m_rows;
  // the constructor takes (rows, cols)
  Matrix<T> res(height, width);
  if (width <= 0 || height <= 0)
    return res;  // empty result: nothing to compute

  // device-side copies of the handles: [0] = this, [1] = other, [2] = res
  Matrix<T>* d_handles = nullptr;
  CUDA_CHECK(cudaMalloc(&d_handles, 3 * sizeof(Matrix<T>)));
  CUDA_CHECK(cudaMemcpy(d_handles + 0, this, sizeof(Matrix<T>),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_handles + 1, &other, sizeof(Matrix<T>),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_handles + 2, &res, sizeof(Matrix<T>),
                        cudaMemcpyHostToDevice));

  const dim3 threads_per_block(32, 32, 1);
  // ceiling division: enough blocks to cover every column and row
  const dim3 blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                    (height + threads_per_block.y - 1) / threads_per_block.y,
                    1);
  kernel::add<T><<<blocks, threads_per_block>>>(d_handles, d_handles + 1,
                                                d_handles + 2);
  CUDA_CHECK(cudaGetLastError());       // invalid launch configuration
  CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while the kernel ran
  CUDA_CHECK(cudaFree(d_handles));
  return res;
}
|
||||
|
||||
template<typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator - (const Matrix<T>& other) const
|
||||
{
|
||||
// step 12
|
||||
|
||||
template <typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator-(const Matrix<T>& other) const {
|
||||
// step 12
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator * (const Matrix<T>& other) const
|
||||
{
|
||||
// step 12
|
||||
|
||||
template <typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator*(const Matrix<T>& other) const {
|
||||
// step 12
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator / (const Matrix<T>& other) const
|
||||
{
|
||||
// step 12
|
||||
|
||||
template <typename T>
|
||||
__host__ Matrix<T> Matrix<T>::operator/(const Matrix<T>& other) const {
|
||||
// step 12
|
||||
}
|
||||
|
||||
} // namespace linalg
|
||||
|
|
|
@ -3,76 +3,97 @@
|
|||
//
|
||||
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
|
||||
//
|
||||
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
|
||||
inline void cuda_check(cudaError_t code, const char *file, int line) {
|
||||
if(code != cudaSuccess) {
|
||||
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
#define CUDA_CHECK(code) \
|
||||
{ cuda_check((code), __FILE__, __LINE__); }
|
||||
// Print "<file>:<line>: [CUDA ERROR] <description>" and abort when code is
// not cudaSuccess; return silently otherwise.
inline void cuda_check(cudaError_t code, const char* file, int line) {
  const bool failed = (code != cudaSuccess);
  if (!failed)
    return;
  std::cout << file << ':' << line << ": [CUDA ERROR] "
            << cudaGetErrorString(code) << std::endl;
  std::abort();
}
|
||||
|
||||
|
||||
//
|
||||
// step 01
|
||||
// return the linear index corresponding to the element at row i and column j
|
||||
// in a matrix of size rows x cols, using row-major storage
|
||||
//
|
||||
__device__ int linear_index(int i, int j, int rows, int cols) {
  // Row-major linear index of element (row i, column j) in a rows x cols
  // matrix. Returns -1 when either index lies outside the matrix, including
  // negative indices, so callers can treat -1 as "no element".
  const bool row_ok = (i >= 0) && (i < rows);
  const bool col_ok = (j >= 0) && (j < cols);
  if (!row_ok || !col_ok)
    return -1;
  return i * cols + j;
}
|
||||
|
||||
//
|
||||
// step 02
|
||||
// CUDA kernel add
|
||||
//
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
constexpr int rows = 200;
|
||||
constexpr int cols = 80;
|
||||
int* x = (int*)malloc(rows*cols*sizeof(int));
|
||||
int* y = (int*)malloc(rows*cols*sizeof(int));
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
x[i] = i;
|
||||
y[i] = std::pow(-1,i) * i;
|
||||
}
|
||||
|
||||
//
|
||||
// step 03
|
||||
//
|
||||
int* dx;
|
||||
int* dy;
|
||||
// 1. allocate on device
|
||||
|
||||
// 2. copy from host to device
|
||||
|
||||
// 3. launch CUDA kernel
|
||||
// const dim3 threads_per_bloc{32,32,1};
|
||||
|
||||
// 4. copy result from device to host
|
||||
|
||||
// 5. free device memory
|
||||
|
||||
|
||||
|
||||
// checking results
|
||||
bool ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const int expected_result = std::pow(-1,i) * i + i;
|
||||
if(y[i] != expected_result) {
|
||||
std::cout << "Failure" << std::endl;
|
||||
std::cout << "Result at index i="
|
||||
<< i << ": expected "
|
||||
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
return 0;
|
||||
// CUDA kernel add
|
||||
// Element-wise sum stored in place: dy[k] = dx[k] + dy[k].
// Launch layout: 2D grid of 2D blocks; the x index of a thread is used as the
// column and the y index as the row. Threads for which linear_index reports
// -1 do nothing.
__global__ void add(const int* dx, int* dy, int rows, int cols) {
  const auto col = threadIdx.x + blockDim.x * blockIdx.x;
  const auto row = threadIdx.y + blockDim.y * blockIdx.y;
  const int index = linear_index(row, col, rows, cols);
  if (index != -1)
    dy[index] = dx[index] + dy[index];
}
|
||||
|
||||
// Fill two host arrays, add them on the device with the add kernel
// (y = x + y) and compare the result with the values computed on the host.
// Prints "Success" or a description of the first mismatch.
int main() {
  constexpr int rows = 200;
  constexpr int cols = 80;
  const size_t size = rows * cols * sizeof(int);

  int* x = (int*)malloc(size);
  int* y = (int*)malloc(size);
  if (x == nullptr || y == nullptr) {
    std::cout << "host allocation failed" << std::endl;
    free(x);
    free(y);
    return 1;
  }
  for (int i = 0; i < rows * cols; ++i) {
    x[i] = i;
    y[i] = std::pow(-1, i) * i;
  }

  //
  // step 03
  //
  int* dx;
  int* dy;
  // 1. allocate on device
  CUDA_CHECK(cudaMalloc(&dx, size));
  CUDA_CHECK(cudaMalloc(&dy, size));

  // 2. copy from host to device
  CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));

  // 3. launch CUDA kernel
  const dim3 threads_per_bloc{32, 32, 1};
  // ceiling division: enough blocks to cover every column and row
  const dim3 blocks((cols + threads_per_bloc.x - 1) / threads_per_bloc.x,
                    (rows + threads_per_bloc.y - 1) / threads_per_bloc.y, 1);
  add<<<blocks, threads_per_bloc>>>(dx, dy, rows, cols);
  // a kernel launch returns no status: query it explicitly
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // 4. copy result from device to host
  // only dy is written by the kernel (dx is const there), so x is not copied
  // back
  CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));

  // 5. free device memory
  CUDA_CHECK(cudaFree(dx));
  CUDA_CHECK(cudaFree(dy));

  // checking results
  bool ok = true;
  for (int i = 0; i < rows * cols; ++i) {
    const int expected_result = std::pow(-1, i) * i + i;
    if (y[i] != expected_result) {
      std::cout << "Failure" << std::endl;
      std::cout << "Result at index i=" << i << ": expected "
                << std::pow(-1, i) * i << '+' << i << '='
                << expected_result << ", got " << y[i] << std::endl;
      ok = false;
      break;
    }
  }
  if (ok)
    std::cout << "Success" << std::endl;

  free(x);
  free(y);

  return 0;
}
|
||||
|
|
|
@ -3,78 +3,110 @@
|
|||
//
|
||||
// example: CUDA_CHECK( cudaMalloc(dx, x, N*sizeof(int) );
|
||||
//
|
||||
#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
|
||||
inline void cuda_check(cudaError_t code, const char *file, int line) {
|
||||
if(code != cudaSuccess) {
|
||||
std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
#define CUDA_CHECK(code) \
|
||||
{ cuda_check((code), __FILE__, __LINE__); }
|
||||
// Check the status of a CUDA runtime call.
// On any status other than cudaSuccess, print the call site (file:line)
// followed by the CUDA error description, then abort the program.
inline void cuda_check(cudaError_t code, const char* file, int line) {
  switch (code) {
  case cudaSuccess:
    return;
  default:
    std::cout << file << ':' << line << ": [CUDA ERROR] "
              << cudaGetErrorString(code) << std::endl;
    std::abort();
  }
}
|
||||
|
||||
#define FMTVEC3(X) "(" << X.x << "," << X.y << "," << X.z << ")"
|
||||
|
||||
//
|
||||
// step 04
|
||||
// return a pointer to the value at row i and column j from base_address
|
||||
// return a pointer to the value at row i and column j from base_address
|
||||
// with pitch in bytes
|
||||
//
|
||||
__device__ inline int* get_ptr(int* base_address, int i, int j, size_t pitch) {
  // Address of the element at row i, column j: rows are pitch bytes apart and
  // elements inside a row are sizeof(int) bytes apart, so the arithmetic is
  // done on a byte pointer.
  char* bytes = (char*)base_address;
  bytes += i * pitch + (j * sizeof(int));
  return (int*)bytes;
}
|
||||
|
||||
//
|
||||
// step 05
|
||||
// CUDA kernel add
|
||||
//
|
||||
|
||||
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
constexpr int rows = 200;
|
||||
constexpr int cols = 80;
|
||||
int* x = (int*)malloc(rows*cols*sizeof(int));
|
||||
int* y = (int*)malloc(rows*cols*sizeof(int));
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
x[i] = i;
|
||||
y[i] = std::pow(-1,i) * i;
|
||||
}
|
||||
|
||||
//
|
||||
// step 06
|
||||
//
|
||||
int* dx;
|
||||
int* dy;
|
||||
size_t pitch;
|
||||
// 1. allocate on device
|
||||
|
||||
// 2. copy from host to device
|
||||
|
||||
// 3. launch CUDA kernel
|
||||
// const dim3 threads_per_bloc{32,32,1};
|
||||
|
||||
// 4. copy result from device to host
|
||||
|
||||
// 5. free device memory
|
||||
|
||||
|
||||
|
||||
// checking results
|
||||
bool ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const int expected_result = std::pow(-1,i) * i + i;
|
||||
if(y[i] != expected_result) {
|
||||
std::cout << "Failure" << std::endl;
|
||||
std::cout << "Result at index i="
|
||||
<< i << ": expected "
|
||||
<< std::pow(-1,i) * i << '+' << i << '=' << expected_result << ", got " << y[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
return 0;
|
||||
// CUDA kernel add
|
||||
// Element-wise sum on pitched storage, stored in place: b = a + b.
// Launch layout: 2D grid of 2D blocks; the x index of a thread is the column
// (checked against width) and the y index the row (checked against height).
// Both buffers are addressed with the same pitch, given in bytes.
__global__ void add_(int* a, int* b, size_t pitch, size_t width,
                     size_t height) {
  const auto col = threadIdx.x + blockDim.x * blockIdx.x;
  const auto row = threadIdx.y + blockDim.y * blockIdx.y;
  if (col < width && row < height) {
    const int* src = get_ptr(a, row, col, pitch);
    int* dst = get_ptr(b, row, col, pitch);
    *dst = *src + *dst;
  }
}
|
||||
|
||||
// Fill two host arrays, add them on the device using pitched allocations
// (y = x + y) and compare the result with the values computed on the host.
// Prints "Success" or a description of the first mismatch.
int main() {
  constexpr int rows = 200;
  constexpr int cols = 80;
  int* x = (int*)malloc(rows * cols * sizeof(int));
  int* y = (int*)malloc(rows * cols * sizeof(int));
  if (x == nullptr || y == nullptr) {
    std::cout << "host allocation failed" << std::endl;
    free(x);
    free(y);
    return 1;
  }
  for (int i = 0; i < rows * cols; ++i) {
    x[i] = i;
    y[i] = std::pow(-1, i) * i;
  }

  //
  // step 06
  //
  int* dx;
  int* dy;
  size_t pitch;
  size_t pitch_y;
  // width of one row in bytes, identical on host and device
  const size_t arr_width = cols * sizeof(int);

  // 1. allocate on device
  CUDA_CHECK(cudaMallocPitch(&dx, &pitch, arr_width, rows));
  CUDA_CHECK(cudaMallocPitch(&dy, &pitch_y, arr_width, rows));
  // the kernel takes a single pitch for both buffers; each allocation reports
  // its own pitch, so make sure they agree before sharing it
  if (pitch != pitch_y) {
    std::cout << __FILE__ << ':' << __LINE__
              << ": device allocations have different pitches" << std::endl;
    std::abort();
  }

  // 2. copy from host to device
  CUDA_CHECK(cudaMemcpy2D(dx, pitch,        //
                          x, arr_width,     //
                          arr_width, rows,  //
                          cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy2D(dy, pitch,        //
                          y, arr_width,     //
                          arr_width, rows,  //
                          cudaMemcpyHostToDevice));

  // 3. launch CUDA kernel
  const auto threads_per_bloc = dim3(32, 32, 1);
  // ceiling division: enough blocks to cover every column and row
  const auto blocks =
      dim3((cols + threads_per_bloc.x - 1) / threads_per_bloc.x,
           (rows + threads_per_bloc.y - 1) / threads_per_bloc.y, 1);
  add_<<<blocks, threads_per_bloc>>>(dx, dy, pitch, cols, rows);
  // a kernel launch returns no status: query it explicitly
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // 4. copy result from device to host
  CUDA_CHECK(cudaMemcpy2D(y, arr_width,     //
                          dy, pitch,        //
                          arr_width, rows,  //
                          cudaMemcpyDeviceToHost));

  // 5. free device memory
  CUDA_CHECK(cudaFree(dx));
  CUDA_CHECK(cudaFree(dy));

  // checking results
  bool ok = true;
  for (int i = 0; i < rows * cols; ++i) {
    const int expected_result = std::pow(-1, i) * i + i;
    if (y[i] != expected_result) {
      std::cout << "Failure" << std::endl;
      std::cout << "Result at index i=" << i << ": expected "
                << std::pow(-1, i) * i << '+' << i << '='
                << expected_result << ", got " << y[i] << std::endl;
      ok = false;
      break;
    }
  }
  if (ok)
    std::cout << "Success" << std::endl;

  free(x);
  free(y);

  return 0;
}
|
||||
|
|
|
@ -1,148 +1,171 @@
|
|||
#include "Matrix.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
{
|
||||
const int rows = 4;
|
||||
const int cols = 4;
|
||||
// instantiate two matrices of integers on the device
|
||||
linalg::Matrix<int> A(rows, cols);
|
||||
linalg::Matrix<int> B(rows, cols);
|
||||
// fill the two matrices
|
||||
A.to_cuda({ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16});
|
||||
B.to_cuda({16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
|
||||
int main() {
|
||||
{
|
||||
const int rows = 4;
|
||||
const int cols = 4;
|
||||
// instantiate two matrices of integers on the device
|
||||
linalg::Matrix<int> A(rows, cols);
|
||||
linalg::Matrix<int> B(rows, cols);
|
||||
// fill the two matrices
|
||||
A.to_cuda({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
|
||||
B.to_cuda({16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
|
||||
|
||||
// compute the sum
|
||||
auto C = A + B;
|
||||
// compute the sum
|
||||
auto C = A + B;
|
||||
|
||||
// transfer the result to the host
|
||||
std::vector<int> c_res;
|
||||
C.to_cpu(c_res);
|
||||
C.free();
|
||||
|
||||
// check results
|
||||
const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
|
||||
if(c_res != c_expected) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
|
||||
std::cout << " expected: ";
|
||||
for(int i : c_expected) std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
std::cout << " got: ";
|
||||
for(int i : c_res) std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
std::cout << "Success" << std::endl;
|
||||
}
|
||||
// transfer the result to the host
|
||||
std::vector<int> c_res;
|
||||
C.to_cpu(c_res);
|
||||
C.free();
|
||||
|
||||
// compute the difference
|
||||
auto D = A - B;
|
||||
// check results
|
||||
const std::vector<int> c_expected{17, 17, 17, 17, 17, 17, 17, 17,
|
||||
17, 17, 17, 17, 17, 17, 17, 17};
|
||||
if (c_res != c_expected) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (+):" << std::endl;
|
||||
std::cout << " expected: ";
|
||||
for (int i : c_expected)
|
||||
std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
std::cout << " got: ";
|
||||
for (int i : c_res)
|
||||
std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
std::cout << "Success" << std::endl;
|
||||
}
|
||||
|
||||
// transfer the result to the host
|
||||
std::vector<int> d_res;
|
||||
D.to_cpu(d_res);
|
||||
D.free();
|
||||
// compute the difference
|
||||
auto D = A - B;
|
||||
|
||||
// check results
|
||||
const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15};
|
||||
if(d_res != d_expected) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
|
||||
std::cout << " expected: ";
|
||||
for(int i : d_expected) std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
std::cout << " got: ";
|
||||
for(int i : d_res) std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
std::cout << "Success" << std::endl;
|
||||
}
|
||||
}
|
||||
// ------------------------------------------------------------------------
|
||||
{
|
||||
const int rows = 89;
|
||||
const int cols = 128;
|
||||
linalg::Matrix<float> A(rows, cols);
|
||||
linalg::Matrix<float> B(rows, cols);
|
||||
std::vector<float> a_values(rows*cols);
|
||||
std::vector<float> b_values(rows*cols);
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
a_values[i] = 1 + float(i) / 100;
|
||||
b_values[i] = std::pow(-1, i) * float(i)/(rows*cols) * 100;
|
||||
}
|
||||
A.to_cuda(a_values);
|
||||
B.to_cuda(b_values);
|
||||
// transfer the result to the host
|
||||
std::vector<int> d_res;
|
||||
D.to_cpu(d_res);
|
||||
D.free();
|
||||
|
||||
auto C = A + B;
|
||||
auto D = A - B;
|
||||
auto E = A * B;
|
||||
auto F = A / B;
|
||||
// check results
|
||||
const std::vector<int> d_expected{-15, -13, -11, -9, -7, -5, -3, -1,
|
||||
1, 3, 5, 7, 9, 11, 13, 15};
|
||||
if (d_res != d_expected) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (-):" << std::endl;
|
||||
std::cout << " expected: ";
|
||||
for (int i : d_expected)
|
||||
std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
std::cout << " got: ";
|
||||
for (int i : d_res)
|
||||
std::cout << i << " ";
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
std::cout << "Success" << std::endl;
|
||||
}
|
||||
}
|
||||
// ------------------------------------------------------------------------
|
||||
{
|
||||
const int rows = 89;
|
||||
const int cols = 128;
|
||||
linalg::Matrix<float> A(rows, cols);
|
||||
linalg::Matrix<float> B(rows, cols);
|
||||
std::vector<float> a_values(rows * cols);
|
||||
std::vector<float> b_values(rows * cols);
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
a_values[i] = 1 + float(i) / 100;
|
||||
b_values[i] = std::pow(-1, i) * float(i) / (rows * cols) * 100;
|
||||
}
|
||||
A.to_cuda(a_values);
|
||||
B.to_cuda(b_values);
|
||||
|
||||
std::vector<float> c_values;
|
||||
C.to_cpu(c_values);
|
||||
std::vector<float> d_values;
|
||||
D.to_cpu(d_values);
|
||||
std::vector<float> e_values;
|
||||
E.to_cpu(e_values);
|
||||
std::vector<float> f_values;
|
||||
F.to_cpu(f_values);
|
||||
auto C = A + B;
|
||||
auto D = A - B;
|
||||
auto E = A * B;
|
||||
auto F = A / B;
|
||||
|
||||
C.free();
|
||||
D.free();
|
||||
E.free();
|
||||
F.free();
|
||||
std::vector<float> c_values;
|
||||
C.to_cpu(c_values);
|
||||
std::vector<float> d_values;
|
||||
D.to_cpu(d_values);
|
||||
std::vector<float> e_values;
|
||||
E.to_cpu(e_values);
|
||||
std::vector<float> f_values;
|
||||
F.to_cpu(f_values);
|
||||
|
||||
const float epsilon = 0.001;
|
||||
bool ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const float diff = std::abs( c_values[i] - (a_values[i] + b_values[i]) );
|
||||
if(diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (+):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] + b_values[i] << std::endl;
|
||||
std::cout << " got: " << c_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
|
||||
ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const float diff = std::abs( d_values[i] - (a_values[i] - b_values[i]) );
|
||||
if(diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (-):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] - b_values[i] << std::endl;
|
||||
std::cout << " got: " << d_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
C.free();
|
||||
D.free();
|
||||
E.free();
|
||||
F.free();
|
||||
|
||||
ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const float diff = std::abs( e_values[i] - (a_values[i] * b_values[i]) );
|
||||
if(diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (*):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] * b_values[i] << std::endl;
|
||||
std::cout << " got: " << e_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
const float epsilon = 0.001;
|
||||
bool ok = true;
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
const float diff =
|
||||
std::abs(c_values[i] - (a_values[i] + b_values[i]));
|
||||
if (diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (+):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] + b_values[i]
|
||||
<< std::endl;
|
||||
std::cout << " got: " << c_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok)
|
||||
std::cout << "Success" << std::endl;
|
||||
|
||||
ok = true;
|
||||
for(int i = 0; i < rows*cols; ++i) {
|
||||
const float diff = std::abs( f_values[i] - (a_values[i] / b_values[i]) );
|
||||
if(diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__ << ": Failure (/):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] / b_values[i] << std::endl;
|
||||
std::cout << " got: " << f_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(ok) std::cout << "Success" << std::endl;
|
||||
}
|
||||
ok = true;
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
const float diff =
|
||||
std::abs(d_values[i] - (a_values[i] - b_values[i]));
|
||||
if (diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (-):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] - b_values[i]
|
||||
<< std::endl;
|
||||
std::cout << " got: " << d_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok)
|
||||
std::cout << "Success" << std::endl;
|
||||
|
||||
return 0;
|
||||
ok = true;
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
const float diff =
|
||||
std::abs(e_values[i] - (a_values[i] * b_values[i]));
|
||||
if (diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (*):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] * b_values[i]
|
||||
<< std::endl;
|
||||
std::cout << " got: " << e_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok)
|
||||
std::cout << "Success" << std::endl;
|
||||
|
||||
ok = true;
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
const float diff =
|
||||
std::abs(f_values[i] - (a_values[i] / b_values[i]));
|
||||
if (diff > epsilon) {
|
||||
std::cout << __FILE__ << ":" << __LINE__
|
||||
<< ": Failure (/):" << std::endl;
|
||||
std::cout << " expected: " << a_values[i] / b_values[i]
|
||||
<< std::endl;
|
||||
std::cout << " got: " << f_values[i] << std::endl;
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok)
|
||||
std::cout << "Success" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue