// notes-ing2/gpu/tp1/c/src/ex4.cu
// 2024-03-10 21:31:32 +01:00
//
// 96 lines, 2.4 KiB, Text
#include <cstdlib>
#include <iostream>
// Loop-header helper: RANGE(i, a, b) expands to `size_t i = a; i < b; i += 1`.
#define RANGE(I, FROM, TO) size_t I = FROM; I < TO; I += 1
//
// Check the result of a CUDA runtime call; abort with a diagnostic on failure.
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
// do { ... } while (0) makes the macro expand to exactly one statement, so it
// stays safe inside an unbraced if/else.
#define CUDA_CHECK(code) do { cuda_check((code), __FILE__, __LINE__); } while (0)
// Prints "<file>:<line>: [CUDA ERROR] <message>" and aborts the process.
// Errors go to stderr so they are not lost when stdout is redirected.
inline void cuda_check(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << file << ':' << line << ": [CUDA ERROR] " << cudaGetErrorString(code) << std::endl;
        std::abort();
    }
}
// step 06
// Element-wise add: dy[i] += dx[i] for i in [0, N).
// Each thread handles a contiguous chunk of ceil(N / totalThreads) elements,
// so any grid/block configuration is valid (tail threads simply do no work).
__global__ void add_strided(int N, const int* dx, int* dy) {
    size_t threads = (size_t)blockDim.x * gridDim.x;            // total threads in the grid
    size_t items_per_thread = (N + threads - 1) / threads;      // exact ceil(N / threads)
    size_t base_index = ((size_t)blockIdx.x * blockDim.x + threadIdx.x) * items_per_thread;
    for (size_t i = 0; i < items_per_thread; ++i) {
        size_t index = base_index + i;
        // Bug fix: the guard was `index > N`, which let index == N through and
        // read/wrote one element past the end of dx/dy. Indices are monotonic,
        // so once past the end we can stop instead of continuing.
        if (index >= (size_t)N) break;
        dy[index] += dx[index];
    }
}
// Host driver: fills x[i]=i, y[i]=i*i, runs add_strided on the GPU, and
// verifies y[i] == i + i*i. Every CUDA call is checked via CUDA_CHECK.
int main()
{
    constexpr int N = 1000;
    const size_t size = N * sizeof(int);

    int* x = (int*)malloc(size);
    int* y = (int*)malloc(size);
    if (x == nullptr || y == nullptr) {
        std::cout << "Failure" << std::endl;
        return 1;
    }
    for (int i = 0; i < N; ++i) {
        x[i] = i;
        y[i] = i * i;
    }

    // step 07
    int* dx;
    int* dy;
    // 1. allocate on device
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));
    // 2. copy from host to device
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));
    // 3. launch CUDA kernel
    const int threads_per_bloc = 32;
    const int blocs = 8;
    add_strided<<<blocs, threads_per_bloc>>>(N, dx, dy);
    CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // catches asynchronous execution errors
    // 4. copy result from device to host
    CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));
    // 5. free device memory
    CUDA_CHECK(cudaFree(dx));
    CUDA_CHECK(cudaFree(dy));

    // checking results: expect y[i] == i + i*i
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        const int expected_result = i + i * i;
        if (y[i] != expected_result) {
            std::cout << "Failure" << std::endl;
            std::cout << "Result at index i="
                      << i << ": expected "
                      << i << '+' << i * i << '=' << expected_result << ", got " << y[i] << std::endl;
            ok = false;
            break;
        }
    }
    if (ok) std::cout << "Success" << std::endl;

    free(x);
    free(y);
    return 0;
}
// # Question 4
// Pour N tâches et X threads au total,
// - chaque thread doit traiter en moyenne N / X tâches
// - un stride valable est ceil(N / X)