The code for week 1 can be found in this Google Colab notebook. Make sure that you are using the "T4 GPU" runtime when running the cells.
!pip install pycuda
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from time import perf_counter
# Compile the CUDA C source below at runtime. The kernel is an element-wise
# product: each thread derives one global index from its block and thread
# coordinates and writes dest[i] = a[i] * b[i].
# NOTE(review): the kernel performs no bounds check on i, so every launch
# must supply exactly as many threads (grid * block) as the arrays have
# elements; surplus threads would index past the end of the buffers.
module = SourceModule('''
__global__ void multiply(float *dest, float *a, float *b) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
dest[i] = a[i] * b[i];
}
''')
# Python-callable handle to the compiled kernel; the launch geometry is
# supplied per call through the block= and grid= keyword arguments.
multiply = module.get_function("multiply")
# Problem size and launch geometry. The `multiply` kernel does no bounds
# checking, so the launch has to cover exactly N elements: derive the grid
# from N instead of repeating unrelated literals.
N = 1_000_000
THREADS_PER_BLOCK = 1000  # CUDA allows at most 1024 threads per block
if N % THREADS_PER_BLOCK:
    raise ValueError("N must be a multiple of THREADS_PER_BLOCK")
NUM_BLOCKS = N // THREADS_PER_BLOCK

# Two random float32 input vectors and a zeroed output of the same shape.
a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
dest = np.zeros_like(a)

# cuda.In / cuda.Out copy the host arrays to the device before the launch and
# the result back afterwards, so the measured interval covers the transfers
# as well as the kernel itself.
t0 = perf_counter()
multiply(
    cuda.Out(dest),
    cuda.In(a),
    cuda.In(b),
    block=(THREADS_PER_BLOCK, 1, 1),
    grid=(NUM_BLOCKS, 1, 1),
)
t1 = perf_counter()
print(t1 - t0)
# CPU baseline: the same element-wise product computed with one Python-level
# iteration per element, timed for comparison with the GPU launch. The loop
# is intentionally not vectorised; it is the slow reference being measured.
dest2 = np.zeros_like(a)
t0 = perf_counter()
# Bound the loop by the input's own length so it does not rely on a size
# variable being defined elsewhere.
for i in range(len(a)):
    dest2[i] = a[i] * b[i]
t1 = perf_counter()
print(t1 - t0)