diff --git a/gpu/tp1/.gitignore b/gpu/tp1/.gitignore
new file mode 100644
index 0000000..bdf1172
--- /dev/null
+++ b/gpu/tp1/.gitignore
@@ -0,0 +1,2 @@
+bin/
+*.zip
\ No newline at end of file
diff --git a/gpu/tp1/c/build.sh b/gpu/tp1/c/build.sh
new file mode 100755
index 0000000..bf02983
--- /dev/null
+++ b/gpu/tp1/c/build.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+cd "$(dirname "$(realpath "$0")")"
+set -e
+
+TARGET="ex1.cu ex2.cu ex3.cu ex4.cu"
+
+if [ $# -gt 0 ]
+then TARGET=$1
+fi
+
+rm -fr bin
+mkdir -p bin
+
+for target in $TARGET
+do nvcc src/$target -o bin/${target%.cu}.out
+done
+
+for target in $TARGET
+do ./bin/${target%.cu}.out
+done
diff --git a/gpu/tp1/c/src/ex1.cu b/gpu/tp1/c/src/ex1.cu
new file mode 100644
index 0000000..1a35094
--- /dev/null
+++ b/gpu/tp1/c/src/ex1.cu
@@ -0,0 +1,39 @@
+#include <iostream>
+
+#define TO_K(X) ((X) / 1000) // parenthesized so TO_K(a + b) expands correctly
+#define TO_G(X) ((X) / 1000000000)
+#define FMT_3D(X) "(" << (X)[0] << ", " << (X)[1] << ", " << (X)[2] << ")"
+
+int main(int argc, char const *argv[])
+{
+    // step 01: enumerate CUDA devices and print their main properties
+    int device_count = -1;
+    cudaGetDeviceCount(&device_count);
+
+    std::cout << "device_count = " << device_count << "\n";
+
+    for (auto i = 0; i < device_count; ++i)
+    {
+        std::cout << "device [" << i << "]:\n";
+
+        struct cudaDeviceProp device_prop;
+        cudaGetDeviceProperties(&device_prop, i);
+
+        std::cout << "\t'device_prop.name' : " << device_prop.name << "\n";
+        std::cout << "\t'device_prop.totalGlobalMem' : " << TO_G(device_prop.totalGlobalMem) << "\n";
+        std::cout << "\t'device_prop.sharedMemPerBlock' : " << TO_K(device_prop.sharedMemPerBlock) << "\n";
+        std::cout << "\t'device_prop.maxThreadsPerBlock' : " << device_prop.maxThreadsPerBlock << "\n";
+        std::cout << "\t'device_prop.maxThreadsDim' : " << FMT_3D(device_prop.maxThreadsDim) << "\n";
+        std::cout << "\t'device_prop.maxGridSize' : " << FMT_3D(device_prop.maxGridSize) << "\n";
+        std::cout << "\t'(device_prop.major, device_prop.minor)' : " << device_prop.major << "." << device_prop.minor << "\n";
+        std::cout << "\t'device_prop.warpSize' : " << device_prop.warpSize << "\n";
+        std::cout << "\t'device_prop.regsPerBlock' : " << device_prop.regsPerBlock << "\n";
+        std::cout << "\t'device_prop.multiProcessorCount' : " << device_prop.multiProcessorCount << "\n";
+
+    }
+
+    return 0;
+}
+
+// # Question 1
+// Avec 49152 octets de mémoire par bloc, il est possible de stocker 49152/4 = 12288 nombres flotants 32bit.
diff --git a/gpu/tp1/c/src/ex2.cu b/gpu/tp1/c/src/ex2.cu
new file mode 100644
index 0000000..f3bc555
--- /dev/null
+++ b/gpu/tp1/c/src/ex2.cu
@@ -0,0 +1,17 @@
+#include <cstdio>
+
+// step 02: minimal kernel — device-side printf identifies block/thread
+
+__global__ void prints_hello() {
+    printf("Hello World bloc=%d thread=%d\n", blockIdx.x, threadIdx.x);
+}
+
+int main() {
+    // step 03: launch is asynchronous; sync so the printf buffer is flushed before exit
+    prints_hello<<<1, 1>>>();
+    cudaDeviceSynchronize();
+    return 0;
+}
+
+// # Question 2
+// Avec 4 blocs de 32 threads, le message apparaitra 4*32 = 128 fois.
diff --git a/gpu/tp1/c/src/ex3.cu b/gpu/tp1/c/src/ex3.cu
new file mode 100644
index 0000000..3496168
--- /dev/null
+++ b/gpu/tp1/c/src/ex3.cu
@@ -0,0 +1,90 @@
+#include <iostream>
+
+#define RANGE(I, FROM, TO) size_t I = FROM; I < TO; I += 1
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+
+// step 04: one thread per element; the guard handles the partial last block
+
+
+__global__ void add(int N, const int* dx, int* dy) {
+    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) return; // fix: '>' let index == N through (out-of-bounds write)
+    dy[index] += dx[index];
+}
+
+
+
+
+int main()
+{
+    constexpr int N = 1000;
+    int* x = (int*)malloc(N*sizeof(int));
+    int* y = (int*)malloc(N*sizeof(int));
+    for(int i = 0; i < N; ++i) {
+        x[i] = i;
+        y[i] = i*i;
+    }
+
+    // step 05
+    int* dx;
+    int* dy;
+
+    // 1. allocate on device
+    size_t size = N * sizeof(int);
+    CUDA_CHECK(cudaMalloc(&dx, size));
+    CUDA_CHECK(cudaMalloc(&dy, size));
+
+    // 2. copy from host to device
+    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
+
+    // 3. launch CUDA kernel (ceil(N/32) blocks, cf. Question 3)
+    const int threads_per_bloc = 32;
+    add<<<(N + threads_per_bloc - 1) / threads_per_bloc, threads_per_bloc>>>(N, dx, dy); // NOTE(review): launch config restored — TODO confirm against original
+    CUDA_CHECK(cudaDeviceSynchronize()); // also surfaces in-kernel faults
+
+
+    // 4. copy result from device to host
+    CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));
+
+    // 5. free device memory
+    CUDA_CHECK(cudaFree(dx));
+    CUDA_CHECK(cudaFree(dy));
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < N; ++i) {
+        const int expected_result = i + i*i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << i << '+' << i*i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
+
+// # Question 3
+// Pour une suite de N tâches, avec des blocs de 32 threads
+// - il faudra idéalement ceil(N/32) blocs.
+// - sur le dernier bloc, N % 32 threads exécuteront une tâche
+// - sur le dernier bloc, 32 - (N % 32) threads n'exécuteront aucune tâche
diff --git a/gpu/tp1/c/src/ex4.cu b/gpu/tp1/c/src/ex4.cu
new file mode 100644
index 0000000..d1084c1
--- /dev/null
+++ b/gpu/tp1/c/src/ex4.cu
@@ -0,0 +1,96 @@
+#include <iostream>
+
+#define RANGE(I, FROM, TO) size_t I = FROM; I < TO; I += 1
+
+//
+// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
+//
+#define CUDA_CHECK(code) { cuda_check((code), __FILE__, __LINE__); }
+inline void cuda_check(cudaError_t code, const char *file, int line) {
+    if(code != cudaSuccess) {
+        std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
+        std::abort();
+    }
+}
+
+
+// step 06: fixed grid; each thread handles a contiguous slice of ~ceil(N/threads) items
+
+__global__ void add_strided(int N, const int* dx, int* dy) {
+    size_t threads = blockDim.x * gridDim.x;
+    size_t items_per_threads = (N / threads) + 1; // floor+1 >= ceil, so the slices cover all N items
+    size_t base_index = (blockIdx.x * blockDim.x + threadIdx.x) * items_per_threads;
+    for (RANGE(i, 0, items_per_threads)) {
+        size_t index = base_index + i;
+        if (index >= N) continue; // fix: '>' let index == N through (out-of-bounds write)
+        dy[index] += dx[index];
+    }
+}
+
+
+
+int main()
+{
+    constexpr int N = 1000;
+    int* x = (int*)malloc(N*sizeof(int));
+    int* y = (int*)malloc(N*sizeof(int));
+    for(int i = 0; i < N; ++i) {
+        x[i] = i;
+        y[i] = i*i;
+    }
+
+    // step 07
+    int* dx;
+    int* dy;
+    // 1. allocate on device
+    size_t size = N * sizeof(int);
+    CUDA_CHECK(cudaMalloc(&dx, size));
+    CUDA_CHECK(cudaMalloc(&dy, size));
+
+
+
+    // 2. copy from host to device
+    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
+
+
+    // 3. launch CUDA kernel
+    const int threads_per_bloc = 32;
+    const int blocs = 8;
+    add_strided<<<blocs, threads_per_bloc>>>(N, dx, dy); // NOTE(review): launch config restored — TODO confirm against original
+    CUDA_CHECK(cudaDeviceSynchronize()); // also surfaces in-kernel faults
+
+
+    // 4. copy result from device to host
+    CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));
+
+    // 5. free device memory
+    CUDA_CHECK(cudaFree(dx));
+    CUDA_CHECK(cudaFree(dy));
+
+
+    // checking results
+    bool ok = true;
+    for(int i = 0; i < N; ++i) {
+        const int expected_result = i + i*i;
+        if(y[i] != expected_result) {
+            std::cout << "Failure" << std::endl;
+            std::cout << "Result at index i="
+                << i << ": expected "
+                << i << '+' << i*i << '=' << expected_result << ", got " << y[i] << std::endl;
+            ok = false;
+            break;
+        }
+    }
+    if(ok) std::cout << "Success" << std::endl;
+
+    free(x);
+    free(y);
+
+    return 0;
+}
+
+// # Question 4
+// Pour N tâches, X threads en tout,
+// - nous devons faire en moyenne N / X tâches par threads
+// - un stride valable est ceil(N / X)