commit d976cfaf74 (parent 8b3bb9c382)
Author: JOLIMAITRE Matthieu, 2024-03-28 17:58:33 +01:00
37 changed files with 2669 additions and 371 deletions

gpu/tp3/c/src/ex1.cu (new file, 106 lines)

@@ -0,0 +1,106 @@
#include <cstddef>
#include <iostream>
// for (RANGE(i, from, to)) iterates i over the half-open range [from, to)
#define RANGE(I, FROM, TO) \
    size_t I = FROM;       \
    I < TO;                \
    I += 1
//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
constexpr int bloc_count = 128;      // number of blocks: host-side equivalent of gridDim.x in a CUDA kernel
constexpr int threads_per_bloc = 32; // threads per block: host-side equivalent of blockDim.x in a CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 01
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
typedef struct {
    size_t from;
    size_t to;
} StrideRange;
// prints a range as "[from,to]" when streamed to std::cout, for debugging
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
// Splits array_length items into contiguous per-thread chunks. The last
// chunks may overshoot the end of the array, so callers must clamp every
// index with an `i < N` check.
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                               size_t block_id, size_t thread_id) {
    auto global_threads = block_dim * grid_dim;
    auto items_per_threads = (array_length / global_threads) + 1; // rounded up so every item is covered
    auto global_thread_index = block_id * block_dim + thread_id;
    auto from = global_thread_index * items_per_threads;
    auto to = from + items_per_threads;
    return StrideRange{from, to};
}
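// Worked example (added for clarity, using the launch parameters below):
// with B = 128 blocks of T = 32 threads, global_threads = 4096; for
// N = 1'000'000, items_per_threads = 1'000'000 / 4096 + 1 = 245, so the
// first thread covers [0,245), the next [245,490), and the last thread
// gets [1003275,1003520), which lies entirely past N and is skipped by
// the kernel's `i < N` guard.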
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
    // Every thread must zero its slot and reach __syncthreads(), even when
    // its range starts past N; an early return here would leave buffer
    // entries uninitialized and make the barrier divergent (undefined behavior).
    buffer[threadIdx.x] = 0;
    for (RANGE(i, range.from, range.to))
        if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
    __syncthreads();
    // Thread 0 folds the T partial sums of this block into dz[blockIdx.x].
    if (threadIdx.x != 0) return;
    dz[blockIdx.x] = 0;
    for (RANGE(i, 0, T)) dz[blockIdx.x] += buffer[i];
}
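// Note: thread 0 folds the T partial sums serially, which is O(T) work on a
// single thread; ex2.cu below replaces this with an O(log T) tree reduction.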
int main() {
    constexpr int N = 1'000'000;
    int* x = (int*)malloc(N * sizeof(int));
    int* y = (int*)malloc(N * sizeof(int));
    int host_expected_result = 0;
    for (int i = 0; i < N; i++) {
        x[i] = i % 10;
        y[i] = i % 3 - 1;
        host_expected_result += x[i] * y[i];
    }

    // step 02
    int *dx, *dy, *dz;
    auto size = N * sizeof(int);
    auto res_size = B * sizeof(int);
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&dz, res_size));

    // step 03
    dot<<<B, T>>>(N, dx, dy, dz);
    CUDA_CHECK(cudaGetLastError());
    int result = 0;
    int* z = (int*)malloc(res_size);
    // cudaMemcpy synchronizes with the kernel launch above before copying.
    CUDA_CHECK(cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost));
    for (RANGE(i, 0, B)) result += z[i];

    // checking results
    if (host_expected_result == result) {
        std::cout << "Success" << std::endl;
    } else {
        std::cout << "Error" << std::endl;
        std::cout << " expected: " << host_expected_result << std::endl;
        std::cout << " got:      " << result << std::endl;
    }

    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(x);
    free(y);
    free(z);
    return 0;
}
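For contrast, here is a sketch of the same per-block dot product written with the canonical grid-stride loop, in which consecutive threads read consecutive elements so global-memory accesses coalesce better than with the contiguous per-thread chunks of stride_range_for. The kernel name dot_grid_stride is hypothetical and not part of this commit; it assumes the same B/T launch configuration as ex1.cu:

// Hypothetical alternative (not in this commit): grid-stride partitioning.
__global__ void dot_grid_stride(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    int sum = 0;
    // Each thread strides by the total thread count, so thread k touches
    // elements k, k + 4096, k + 8192, ... (coalesced accesses).
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        sum += dx[i] * dy[i];
    buffer[threadIdx.x] = sum;
    __syncthreads();
    if (threadIdx.x != 0) return;
    int total = 0;
    for (int i = 0; i < T; i += 1) total += buffer[i];
    dz[blockIdx.x] = total;
}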

gpu/tp3/c/src/ex2.cu (new file, 119 lines)

@@ -0,0 +1,119 @@
#include <cstddef>
#include <iostream>
//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
    { cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
constexpr int bloc_count = 128;      // number of blocks: host-side equivalent of gridDim.x in a CUDA kernel
constexpr int threads_per_bloc = 32; // threads per block: host-side equivalent of blockDim.x in a CUDA kernel
constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;
//
// step 04
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//
// for (RANGE(i, from, to)) iterates i over the half-open range [from, to)
#define RANGE(I, FROM, TO) \
    size_t I = FROM;       \
    I < TO;                \
    I += 1
// infinite loop, exited from inside with return or break
#define loop while (1)
typedef struct {
    size_t from;
    size_t to;
} StrideRange;
// prints a range as "[from,to]" when streamed to std::cout, for debugging
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"
// Splits array_length items into contiguous per-thread chunks. The last
// chunks may overshoot the end of the array, so callers must clamp every
// index with an `i < N` check.
__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                               size_t block_id, size_t thread_id) {
    auto global_threads = block_dim * grid_dim;
    auto items_per_threads = (array_length / global_threads) + 1; // rounded up so every item is covered
    auto global_thread_index = block_id * block_dim + thread_id;
    auto from = global_thread_index * items_per_threads;
    auto to = from + items_per_threads;
    return StrideRange{from, to};
}
// Tree reduction over `array` in shared memory: each pass folds the upper
// half into the lower half, leaving the total in array[0]. N must be a
// power of two (here T = 32). The guard is on the add rather than an early
// return so that every thread of the block reaches __syncthreads().
__device__ void reduce_rec(int N, int* array) {
    auto length = N;
    auto thread_id = threadIdx.x;
    loop {
        if (length <= 1) return;
        auto half = length / 2;
        __syncthreads();
        if (thread_id < half) array[thread_id] += array[thread_id + half];
        length = half;
    }
}
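// Worked example (added for clarity): with N = T = 32, the active length
// shrinks 32 -> 16 -> 8 -> 4 -> 2 -> 1 over five passes; after the last
// pass, array[0] holds the sum of all 32 entries.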
__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
    __shared__ int buffer[T];
    auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
    // No early return: every thread must zero its slot and take part in the
    // barriers inside reduce_rec, even when its range starts past N.
    buffer[threadIdx.x] = 0;
    for (RANGE(i, range.from, range.to))
        if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
    reduce_rec(T, buffer);
    if (threadIdx.x != 0) return;
    dz[blockIdx.x] = buffer[0];
}
int main() {
    constexpr int N = 1'000'000;
    int* x = (int*)malloc(N * sizeof(int));
    int* y = (int*)malloc(N * sizeof(int));
    int host_expected_result = 0;
    for (int i = 0; i < N; i++) {
        x[i] = i % 10;
        y[i] = i % 3 - 1;
        host_expected_result += x[i] * y[i];
    }

    // step 05
    int result = 0;
    int *dx, *dy, *dz;
    auto size = N * sizeof(int);
    auto res_size = B * sizeof(int);
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&dz, res_size));
    dot<<<B, T>>>(N, dx, dy, dz);
    CUDA_CHECK(cudaGetLastError());
    int* z = (int*)malloc(res_size);
    // cudaMemcpy synchronizes with the kernel launch above before copying.
    CUDA_CHECK(cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost));
    for (RANGE(i, 0, B)) result += z[i];

    // checking results
    if (host_expected_result == result) {
        std::cout << "Success" << std::endl;
    } else {
        std::cout << "Error" << std::endl;
        std::cout << " expected: " << host_expected_result << std::endl;
        std::cout << " got:      " << result << std::endl;
    }

    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(x);
    free(y);
    free(z);
    return 0;
}
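Since T is 32, each block here is exactly one warp, so a further refinement (not in this commit, sketched with a hypothetical warp_reduce helper) can keep the partial sums in registers and fold them with warp shuffles instead of shared memory:

// Hypothetical refinement (not in this commit): folds the values held by
// the 32 lanes of a warp into lane 0's register using warp shuffles.
__device__ int warp_reduce(int value) {
    for (int offset = 16; offset > 0; offset /= 2)
        value += __shfl_down_sync(0xffffffff, value, offset);
    return value; // the full sum is only valid in lane 0
}

The kernel would then write dz[blockIdx.x] directly from lane 0, with no __shared__ buffer and no __syncthreads().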