#include <iostream>
#include <cstdlib>

// Loop-header helper: declares `size_t I` iterating over the half-open range [FROM, TO).
#define RANGE(I, FROM, TO) size_t I = FROM; I < TO; I += 1

// Wraps a CUDA runtime call and aborts with a file:line diagnostic on failure.
//
// example: CUDA_CHECK( cudaMalloc(&dx, N*sizeof(int)) );
//
#define CUDA_CHECK(code) do { cuda_check((code), __FILE__, __LINE__); } while (0)
inline void cuda_check(cudaError_t code, const char *file, int line)
{
    if(code != cudaSuccess) {
        std::cout << file << ':' << line
                  << ": [CUDA ERROR] " << cudaGetErrorString(code)
                  << std::endl;
        std::abort();
    }
}

// step 06
// Element-wise add: dy[i] += dx[i] for i in [0, N).
// Each thread processes a contiguous chunk of ceil(N / total_threads) items
// starting at its own base index; the bounds check handles the ragged tail.
// Launch with any 1D grid/block configuration.
__global__ void add_strided(int N, const int* dx, int* dy)
{
    size_t threads = (size_t)blockDim.x * gridDim.x;
    // Ceil division so the chunks cover all N items without over-counting
    // when N is an exact multiple of the thread count.
    size_t items_per_threads = (N + threads - 1) / threads;
    // Widen to size_t BEFORE multiplying to avoid 32-bit overflow.
    size_t base_index = ((size_t)blockIdx.x * blockDim.x + threadIdx.x) * items_per_threads;
    for (RANGE(i, 0, items_per_threads)) {
        size_t index = base_index + i;
        // Valid indices are 0..N-1; `index >= N` (not `> N`) prevents an
        // out-of-bounds access at index == N. Indices only grow, so stop here.
        if (index >= (size_t)N) break;
        dy[index] += dx[index];
    }
}

int main()
{
    constexpr int N = 1000;
    int* x = (int*)malloc(N*sizeof(int));
    int* y = (int*)malloc(N*sizeof(int));
    for(int i = 0; i < N; ++i) {
        x[i] = i;
        y[i] = i*i;
    }

    // step 07
    int* dx;
    int* dy;

    // 1. allocate on device
    size_t size = N * sizeof(int);
    CUDA_CHECK(cudaMalloc(&dx, size));
    CUDA_CHECK(cudaMalloc(&dy, size));

    // 2. copy from host to device
    CUDA_CHECK(cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice));

    // 3. launch CUDA kernel
    const int threads_per_bloc = 32;
    const int blocs = 8;
    add_strided<<<blocs, threads_per_bloc>>>(N, dx, dy);
    CUDA_CHECK(cudaGetLastError());       // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // catch asynchronous execution errors

    // 4. copy result from device to host
    CUDA_CHECK(cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost));

    // 5. free device memory
    CUDA_CHECK(cudaFree(dx));
    CUDA_CHECK(cudaFree(dy));

    // checking results
    bool ok = true;
    for(int i = 0; i < N; ++i) {
        const int expected_result = i + i*i;
        if(y[i] != expected_result) {
            std::cout << "Failure" << std::endl;
            std::cout << "Result at index i=" << i << ": expected " << i
                      << '+' << i*i << '=' << expected_result
                      << ", got " << y[i] << std::endl;
            ok = false;
            break;
        }
    }
    if(ok) std::cout << "Success" << std::endl;

    free(x);
    free(y);
    return 0;
}

// # Question 4
// For N tasks and X threads in total,
// - each thread must handle on average N / X tasks
// - a valid stride is ceil(N / X)