gpu

commit d976cfaf74 (parent 8b3bb9c382)

37 changed files with 2669 additions and 371 deletions
13  gpu/tp3/.clang-format  Normal file
@@ -0,0 +1,13 @@
# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json
---
BasedOnStyle: LLVM
DerivePointerAlignment: false
IndentWidth: 4
PointerAlignment: Left
TabWidth: 4
UseTab: Always
AllowShortIfStatementsOnASingleLine: AllIfsAndElse
AllowShortLoopsOnASingleLine: true
ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: All
4  gpu/tp3/.clangd  Normal file
@@ -0,0 +1,4 @@
CompileFlags:
  Add:
    - -xcuda
    - --no-cuda-version-check
2  gpu/tp3/.gitignore  vendored  Normal file
@@ -0,0 +1,2 @@
bin/
/*.zip
3  gpu/tp3/README.md  Normal file
@@ -0,0 +1,3 @@
# TP 3

> `Matthieu JOLIMAITRE <matthieu.jolimaitre@epita.fr>`
25  gpu/tp3/c/build.sh  Executable file
@@ -0,0 +1,25 @@
#!/bin/sh
cd "$(dirname "$(realpath "$0")")"
set -e
alias log="echo '[build.sh]'"

# default targets, overridden by any arguments passed to the script
targets="ex1 ex2"

if [ $# -gt 0 ]
then targets=$@
fi

rm -fr bin
mkdir -p bin

ccargs="-O2"
#ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"

# compile each target with nvcc, then run the resulting binary
for target in $targets
do
	echo ""
	nvcc $ccargs -o bin/${target}.out src/${target}.cu
	./bin/${target}.out
done
106  gpu/tp3/c/src/ex1.cu  Normal file
@@ -0,0 +1,106 @@
#include <cstddef>
#include <iostream>

#define RANGE(I, FROM, TO) \
	size_t I = FROM;       \
	I < TO;                \
	I += 1

//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
	{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
	if (code != cudaSuccess) {
		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
		std::abort();
	}
}

constexpr int bloc_count = 128;      // constexpr equivalent of gridDim.x in the CUDA kernel
constexpr int threads_per_bloc = 32; // constexpr equivalent of blockDim.x in the CUDA kernel

constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;

//
// step 01
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//

typedef struct {
	size_t from;
	size_t to;
} StrideRange;
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"

__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                                size_t block_id, size_t thread_id) {
	auto global_threads = block_dim * grid_dim;
	auto items_per_threads = (array_length / global_threads) + 1;
	auto global_thread_index = block_id * block_dim + thread_id;
	auto from = global_thread_index * items_per_threads;
	auto to = from + items_per_threads;
	return StrideRange{from, to};
}
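// Worked example (values follow from B = 128, T = 32 and the N = 1e6 used in main below):
// global_threads = 32 * 128 = 4096 and items_per_threads = 1000000 / 4096 + 1 = 245,
// so global thread 0 covers [0,245), thread 1 covers [245,490), and so on; threads whose
// `from` lands past N do no work, and the `i < N` guard clips the last partial range.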

__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
	__shared__ int buffer[T];
	auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
	if (range.from >= N) return;
	// each thread accumulates its slice of the dot product in its own shared-memory slot
	buffer[threadIdx.x] = 0;
	for (RANGE(i, range.from, range.to))
		if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
	__syncthreads();
	// thread 0 then sums the per-thread partial results of this block into dz[blockIdx.x]
	if (threadIdx.x != 0) return;
	dz[blockIdx.x] = 0;
	for (RANGE(i, 0, T)) dz[blockIdx.x] += buffer[i];
}

int main() {
	constexpr int N = 1e6;

	int* x = (int*)malloc(N * sizeof(int));
	int* y = (int*)malloc(N * sizeof(int));
	int host_expected_result = 0;
	for (int i = 0; i < N; i++) {
		x[i] = i % 10;
		y[i] = i % 3 - 1;
		host_expected_result += x[i] * y[i];
	}

	// step 02
	int *dx, *dy, *dz;
	auto size = N * sizeof(int);
	auto res_size = B * sizeof(int);
	cudaMalloc(&dx, size);
	cudaMalloc(&dy, size);
	cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
	cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
	cudaMalloc(&dz, res_size);

	// step 03
	dot<<<B, T>>>(N, dx, dy, dz);
	int result = 0;
	int* z = (int*)malloc(res_size);
	cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
	for (RANGE(i, 0, B)) result += z[i];

	// checking results
	if (host_expected_result == result) {
		std::cout << "Success" << std::endl;
	} else {
		std::cout << "Error" << std::endl;
		std::cout << " expected: " << host_expected_result << std::endl;
		std::cout << " got: " << result << std::endl;
	}

	free(x);
	free(y);

	return 0;
}
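Note: ex1.cu defines CUDA_CHECK but leaves the allocations, copies, and the kernel launch in main unchecked. A minimal sketch of how the step 02/03 calls could be wrapped with it, reusing the names declared above (dx, dy, dz, x, y, z, size, res_size); cudaGetLastError and cudaDeviceSynchronize are standard CUDA runtime calls added here only to surface kernel errors, not part of the exercise code:

	CUDA_CHECK(cudaMalloc(&dx, size));
	CUDA_CHECK(cudaMalloc(&dy, size));
	CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMalloc(&dz, res_size));

	dot<<<B, T>>>(N, dx, dy, dz);
	CUDA_CHECK(cudaGetLastError());      // reports launch/configuration errors
	CUDA_CHECK(cudaDeviceSynchronize()); // reports errors raised while the kernel ran
	CUDA_CHECK(cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost));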
119  gpu/tp3/c/src/ex2.cu  Normal file
@@ -0,0 +1,119 @@
#include <iostream>

//
// example: CUDA_CHECK( cudaMalloc(&dx, N * sizeof(int)) );
//
#define CUDA_CHECK(code) \
	{ cuda_check((code), __FILE__, __LINE__); }
inline void cuda_check(cudaError_t code, const char* file, int line) {
	if (code != cudaSuccess) {
		std::cout << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
		std::abort();
	}
}

constexpr int bloc_count = 128;      // constexpr equivalent of gridDim.x in the CUDA kernel
constexpr int threads_per_bloc = 32; // constexpr equivalent of blockDim.x in the CUDA kernel

constexpr int B = bloc_count;
constexpr int T = threads_per_bloc;

//
// step 04
//
// dx: array of size N
// dy: array of size N
// dz: array of size B
//

#define RANGE(I, FROM, TO) \
	size_t I = FROM;       \
	I < TO;                \
	I += 1

#define loop while (1)

typedef struct {
	size_t from;
	size_t to;
} StrideRange;
#define FMT_RANGE(R) "[" << R.from << "," << R.to << "]"

__device__ __host__ static inline StrideRange stride_range_for(size_t array_length, size_t block_dim, size_t grid_dim,
                                                                size_t block_id, size_t thread_id) {
	auto global_threads = block_dim * grid_dim;
	auto items_per_threads = (array_length / global_threads) + 1;
	auto global_thread_index = block_id * block_dim + thread_id;
	auto from = global_thread_index * items_per_threads;
	auto to = from + items_per_threads;
	return StrideRange{from, to};
}

// tree reduction in shared memory: at each step the first half of the threads
// folds the second half into itself, until the block total is left in array[0]
__device__ void reduce_rec(int N, int* array) {
	auto length = N;
	auto thread_id = threadIdx.x;
	loop {
		if (length <= 1) return;
		auto half = length / 2;
		auto used_threads = half;
		if (thread_id >= used_threads) return;
		__syncthreads();
		array[thread_id] += array[thread_id + half];
		length = half;
	}
}

__global__ void dot(int N, const int* dx, const int* dy, int* dz) {
	__shared__ int buffer[T];
	auto range = stride_range_for(N, blockDim.x, gridDim.x, blockIdx.x, threadIdx.x);
	if (range.from >= N) return;
	// per-thread partial products, then a block-wide tree reduction of the shared buffer
	buffer[threadIdx.x] = 0;
	for (RANGE(i, range.from, range.to))
		if (i < N) buffer[threadIdx.x] += dx[i] * dy[i];
	reduce_rec(T, buffer);
	if (threadIdx.x != 0) return;
	dz[blockIdx.x] = buffer[0];
}

int main() {
	constexpr int N = 1e6;

	int* x = (int*)malloc(N * sizeof(int));
	int* y = (int*)malloc(N * sizeof(int));
	int host_expected_result = 0;
	for (int i = 0; i < N; i++) {
		x[i] = i % 10;
		y[i] = i % 3 - 1;
		host_expected_result += x[i] * y[i];
	}

	// step 05
	int result = 0;
	int *dx, *dy, *dz;
	auto size = N * sizeof(int);
	auto res_size = B * sizeof(int);
	cudaMalloc(&dx, size);
	cudaMalloc(&dy, size);
	cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
	cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
	cudaMalloc(&dz, res_size);
	dot<<<B, T>>>(N, dx, dy, dz);
	int* z;
	z = (int*)malloc(res_size);
	cudaMemcpy(z, dz, res_size, cudaMemcpyDeviceToHost);
	for (RANGE(i, 0, B)) result += z[i];

	// checking results
	if (host_expected_result == result) {
		std::cout << "Success" << std::endl;
	} else {
		std::cout << "Error" << std::endl;
		std::cout << " expected: " << host_expected_result << std::endl;
		std::cout << " got: " << result << std::endl;
	}

	free(x);
	free(y);

	return 0;
}
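Note: reduce_rec above folds the shared buffer in halves until the block total sits in buffer[0]. The same halving scheme can be sketched on the host; this is an illustration only, with a hypothetical reduce_host helper, not part of the commit:

	#include <iostream>

	// at each step, element i of the first half absorbs element i + half;
	// with a power-of-two length (T = 32 in the kernel) the total ends up at index 0
	static void reduce_host(int length, int* array) {
		while (length > 1) {
			int half = length / 2;
			for (int i = 0; i < half; i++) array[i] += array[i + half];
			length = half;
		}
	}

	int main() {
		int values[8] = {1, 2, 3, 4, 5, 6, 7, 8};
		reduce_host(8, values);
		std::cout << values[0] << std::endl; // prints 36
	}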
BIN  gpu/tp3/tp3.pdf  Normal file
Binary file not shown.