diff --git a/gpu/tp5/c/build.sh b/gpu/tp5/c/build.sh
index 82404bf..22ebca9 100755
--- a/gpu/tp5/c/build.sh
+++ b/gpu/tp5/c/build.sh
@@ -13,6 +13,7 @@ fi
 
 mkdir -p bin
 
+cc=hipcc
 ccargs="-O2"
 #ccargs="$ccargs -g -G -Xcompiler -fsanitize=address"
 
@@ -21,5 +22,5 @@ do
     sources="$MODULES $target"
     inputs="$(for src in $sources; do echo "src/$src"; done | xargs)"
     rm -f bin/${target}.out
-    nvcc $ccargs -o bin/${target}.out $modules $inputs
+    $cc $ccargs -o bin/${target}.out $modules $inputs
 done
diff --git a/gpu/tp5/c/src/matrix.cu b/gpu/tp5/c/src/matrix.cu
index 24f5d9c..5e5a350 100644
--- a/gpu/tp5/c/src/matrix.cu
+++ b/gpu/tp5/c/src/matrix.cu
@@ -163,24 +163,60 @@ namespace kernel {
 
 // step 05
 // return the 1D index of a row-major matrix of size (rows,cols) from 2D indices (i,j) inside sub-matrix (bi,bj)
 //
-__device__ int index2(int i, int j, int bi, int bj, int rows, int cols) {
-    auto local_x = j;
-    auto local_y = i;
+__device__ int index2(int y, int x, int by, int bx, int height, int width) {
+    auto local_x = x;
+    auto local_y = y;
     auto local_matrix_width = T;
-    auto base_x = bj * local_matrix_width;
-    auto base_y = bi * local_matrix_width;
+    auto base_x = bx * local_matrix_width;
+    auto base_y = by * local_matrix_width;
 
-    auto x = base_x + local_x;
-    auto y = base_y + local_y;
-    return index1(y, x, rows, cols);
+    auto global_x = base_x + local_x;
+    auto global_y = base_y + local_y;
+    return index1(global_y, global_x, height, width);
 }
 
 //
 // step 06
 //
 __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P) {
-    auto step_count = (); //
+    auto A_height = N;
+    auto A_width = M;
+
+    auto B_height = A_width;
+    auto B_width = P;
+
+    auto result_height = A_height;
+    auto result_width = B_width;
+
+    auto result = 0;
+    auto thread_x = threadIdx.x;
+    auto thread_y = threadIdx.y;
+    // ceil-div: no extra all-out-of-range step when A_width is a multiple of T
+    auto step_count = (A_width + T - 1) / T;
+    for (RANGE(step_index, 0, step_count)) {
+        __shared__ int local_A[T][T];
+        __shared__ int local_B[T][T];
+        auto index_A = index2(thread_y, thread_x, blockIdx.y, step_index, A_height, A_width);
+        auto index_B = index2(thread_y, thread_x, step_index, blockIdx.x, B_height, B_width);
+        // load 0 for out-of-range elements (index2/index1 yield -1 there); an
+        // early return instead would skip __syncthreads() in divergent flow (UB)
+        local_A[thread_x][thread_y] = (index_A == -1) ? 0 : A[index_A];
+        local_B[thread_x][thread_y] = (index_B == -1) ? 0 : B[index_B];
+        __syncthreads();
+        for (RANGE(i, 0, T)) {
+            auto a = local_A[i][thread_y];
+            auto b = local_B[thread_x][i];
+            result += a * b;
+        }
+        __syncthreads();
+    }
+    // the grid over-covers the result matrix: guard the final store
+    auto out_y = THREAD_GID(y);
+    auto out_x = THREAD_GID(x);
+    if (out_y < result_height && out_x < result_width) {
+        C[index1(out_y, out_x, result_height, result_width)] = result;
+    }
 }
 
 } // namespace kernel
@@ -190,9 +226,31 @@ __global__ void matmul3(const int* A, const int* B, int* C, int N, int M, int P)
 //
 std::vector<int> matmul3(const std::vector<int>& A, const std::vector<int>& B, int N, int M, int P) {
     //
-    // step 07
+    // step 04
     //
-    std::vector<int> C(N * P);
+    auto A_height = N;
+    auto A_width = M;
+    auto A_dev = cuda_malloc_copy(A.data(), A_width * A_height);
 
-    return C;
+    auto B_height = A_width;
+    auto B_width = P;
+    auto B_dev = cuda_malloc_copy(B.data(), B_width * B_height);
+
+    auto result_height = A_height;
+    auto result_width = B_width;
+    auto result_dev = cuda_malloc(A_height * B_width);
+
+    // ceil-div: one extra block only when the size is not a multiple of the tile
+    auto grid_dim = dim3((result_width + threads_per_bloc - 1) / threads_per_bloc,
+                         (result_height + threads_per_bloc - 1) / threads_per_bloc, 1);
+    auto block_dim = dim3(threads_per_bloc, threads_per_bloc, 1);
+    kernel::matmul3<<<grid_dim, block_dim>>>(A_dev, B_dev, result_dev, A_height, A_width, B_width);
+    CUDA_CHECK(cudaGetLastError()); // surface launch-configuration errors
+
+    CUDA_CHECK(cudaFree(A_dev));
+    CUDA_CHECK(cudaFree(B_dev));
+    auto result = cuda_into_host(result_dev, result_width * result_height);
+    CUDA_CHECK(cudaFree(result_dev));
+
+    return result;
 }