Support this channel at: buymeacoffee.c... More on Matrix Multiplication: • Matrix multiplication ... en.wikipedia.o... Code for animations and examples: github.com/Szy...
Hi, I've been watching these videos in addition to reading the Programming Massively Parallel Processors, My take on the exercise: (for the sake of brevity, I will not include assigning memory or memcpy for now) ```c // Kernel Function for Array Summing __global__ void sumArrays_Kernel(float *A, float *B, float *C, float *D, int Width, int Height, int Depth) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int z = blockIdx.z * blockDim.z + threadIdx.z; if (x < Width && y < Height && z < Depth) { int index = x + y * Width + z * Width * Height; // Defined as index as used twice in next line D[index] = A[index] + B[y * Width + x] + C[x]; } } void sumArrays_Host(float *A, float *B, float *C, float *D, int X, int Y, int Z) { float *A_d, *B_d, *C_d, *D_d; // Malloc and Memcpy vars (i.e A -> A_d) dim3 block(2, 2, 2); // I'm not massively sure on good sizing here dim3 grid((X + block.x - 1) / block.x, (Y + block.y - 1) / block.y, (Z + block.z - 1) / block.z); sumArraysKernel(d_A, d_B, d_C, d_D, X, Y, Z); // memcpy result back, and then free memory } ``` General idea is that we're using a different index for each input vector, based on the logic you were mentioning earlier, the block and grid logic is just making sure we're in bounds
This is a great series, thank you!
How does this have only 165 views, it's so good
ikr! Even 3 weeks later, it's not even at 1k :(
equations at 2:24 are incorrect
Hi, I've been watching these videos in addition to reading "Programming Massively Parallel Processors".
My take on the exercise (for the sake of brevity, I will not include allocating device memory or the memcpy calls for now):
```c
// Kernel function for array summing.
//
// Each thread handles one (x, y, z) element and writes
//     D[z][y][x] = A[z][y][x] + B[y][x] + C[x]
// where A and D are indexed as flat Width * Height * Depth arrays (x varies
// fastest), B is indexed as a flat Width * Height array and C is indexed by x
// alone.
//
// Parameters:
//   A, B, C  - input arrays, indexed as described above
//   D        - output array, indexed like A
//   Width    - extent along x
//   Height   - extent along y
//   Depth    - extent along z
__global__ void sumArrays_Kernel(float *A, float *B, float *C, float *D, int Width, int Height, int Depth) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int z = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads whose coordinates fall outside the volume have nothing to do.
    if (x >= Width || y >= Height || z >= Depth) {
        return;
    }

    // Build the flat offsets in size_t: Width * Height * Depth can exceed
    // INT_MAX for large volumes, and signed int overflow is undefined behaviour.
    const size_t plane = static_cast<size_t>(y) * Width + x;                    // offset inside one z-slice (also B's index)
    const size_t index = static_cast<size_t>(z) * Width * Height + plane;       // offset inside the full volume

    D[index] = A[index] + B[plane] + C[x];
}
// Host-side wrapper that launches sumArrays_Kernel over an X * Y * Z volume.
//
// Parameters:
//   A, B, C  - host input arrays
//   D        - host output array
//   X, Y, Z  - extents passed to the kernel as Width, Height and Depth
//
// Device allocation, the host<->device copies and the frees are still left as
// placeholders (see the comments below); the pointers must be set up before
// the launch for this to run.
void sumArrays_Host(float *A, float *B, float *C, float *D, int X, int Y, int Z) {
    float *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr;

    // Malloc and Memcpy vars (i.e A -> A_d).
    // Going by how the kernel indexes them: A_d and D_d need X * Y * Z floats,
    // B_d needs X * Y floats and C_d needs X floats.

    // 32 threads along x gives each warp a contiguous run of the
    // fastest-varying index; 32 * 4 * 2 = 256 threads per block.
    // (The previous 2 x 2 x 2 block was only 8 threads, less than one warp.)
    dim3 block(32, 4, 2);

    // Round up so that the grid covers every element even when an extent is
    // not a multiple of the block size; the kernel's bounds check discards
    // the surplus threads.
    dim3 grid((X + block.x - 1) / block.x,
              (Y + block.y - 1) / block.y,
              (Z + block.z - 1) / block.z);

    // Launch with an explicit execution configuration and the device pointers.
    sumArrays_Kernel<<<grid, block>>>(A_d, B_d, C_d, D_d, X, Y, Z);

    // memcpy result back (D_d -> D), and then free memory
}
```
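The general idea is that we use a different index for each input array, based on the logic you mentioned earlier: the full (x, y, z) index for A and D, only (x, y) for B, and only x for C. The grid is rounded up so that every element is covered, and the bounds check in the kernel makes sure the extra threads stay in bounds.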