using LinearAlgebra
using Plots
pyplot()
using BenchmarkTools
using Printf
using CUDA

# Benchmark square matrices whose element counts span 2^12 .. 2^28
sizes = 2 .^ (12:2:28)
N = Int.(sqrt.(sizes))

timeCPU = Vector{Float32}(undef, length(sizes))
timeGPU = Vector{Float32}(undef, length(sizes))

for i = 1:length(sizes)
    # First on the CPU
    An = rand(Float32, N[i], N[i])
    Bn = rand(Float32, N[i], N[i])
    timeCPU[i] = @elapsed An * Bn
    GC.gc(true)

    # Now on the GPU (CUDA.rand produces Float32 arrays by default)
    Ac = CUDA.rand(N[i], N[i])
    Bc = CUDA.rand(N[i], N[i])
    timeGPU[i] = CUDA.@elapsed Ac * Bc
    GC.gc(true)
    CUDA.reclaim()
end

# A multiply of two N×N matrices takes 2N^3 - N^2 floating-point operations
gflopsCPU = (2 .* N .^ 3 .- N .^ 2) ./ timeCPU / 1e9
gflopsGPU = (2 .* N .^ 3 .- N .^ 2) ./ timeGPU / 1e9

@printf(
    "Achieved peak calculation rates of %.1f GFLOPS on CPU, %.1f GFLOPS on GPU\n",
    maximum(gflopsCPU),
    maximum(gflopsGPU)
)

plot(
    sizes,
    gflopsCPU,
    lw = 2,
    legend = :topleft,
    xaxis = ("Matrix size (numel)", :log10),
    xlims = (10^3, 10^9),
    frame = true,
    label = string(
        "CPU (Max: ",
        round(maximum(gflopsCPU), digits = 2),
        " GFLOPS @Xeon E5-2650v4)",
    ),
)
plot!(
    sizes,
    gflopsGPU,
    lw = 2,
    label = string(
        "GPU (Max: ",
        round(maximum(gflopsGPU), digits = 2),
        " GFLOPS @GTX 1080Ti)",
    ),
)
plot!(yaxis = ("Calculation Rate (GFLOPS)"))
plot!(title = "Single precision matrix-matrix multiply")
plot!(minorxgrid = true, ylims = :round)
scatter!(
    [sizes[argmax(gflopsCPU)], sizes[argmax(gflopsGPU)]],
    [maximum(gflopsCPU), maximum(gflopsGPU)],
    label = "",
    marker = (10, 0.3, [:blue, :red]),
)
savefig("gpu_perf_result.png")