#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>
 
 const int M = 2048;
 const int N = 2048;
 const int K = 4096;
 float alpha = 1.0f;
 float beta = 0.5f;
 const int ITER = 1000;
 
 const int BM = 64;
 const int BN = 64;
 const int BK = 8;
 const int TM = 8;
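// Tiling scheme: each thread block computes one BM x BN tile of C, streaming
// the K dimension through shared memory in BK-wide slices. Each thread
// accumulates TM results, so a block runs with BM * BN / TM = 512 threads.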
 
template<const int BM, const int BN, const int BK, const int TM>
__global__ void CacheGEMM(float* A, float* B, float* C, int M, int N, int K, float Alpha, float Beta)
{
    // Tile coordinates of this block within C.
    const unsigned int cRow = blockIdx.y;
    const unsigned int cCol = blockIdx.x;

    // Per-thread accumulator for TM consecutive rows of one output column.
    float threadResult[TM]{ 0 };

    // Position of this thread's output column within the BM x BN tile.
    const int threadCol = threadIdx.x % BN;
    const int threadRow = threadIdx.x / BN;

    // Shared-memory tiles for the current BM x BK slice of A and BK x BN slice of B.
    __shared__ float As[BM * BK];
    __shared__ float Bs[BK * BN];

    // Advance the global pointers to the start of this block's tiles.
    A += cRow * BM * K;
    B += cCol * BN;
    C += cRow * BM * N + cCol * BN;

    // Load indices: each of the BM*BN/TM threads stages exactly one element of As and one of Bs.
    const unsigned int innerRowA = threadIdx.x / BK;
    const unsigned int innerColA = threadIdx.x % BK;
    const unsigned int innerRowB = threadIdx.x / BN;
    const unsigned int innerColB = threadIdx.x % BN;

    // March across the K dimension one BK-wide slice at a time.
    for (int blkIdx = 0; blkIdx < K; blkIdx += BK)
    {
        // Stage the current slices of A and B in shared memory.
        As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
        Bs[innerRowB * BN + innerColB] = B[innerRowB * N + innerColB];
        __syncthreads();

        // Move the global pointers to the next slice.
        A += BK;
        B += BK * N;

        // Multiply the staged slices; each Bs element is loaded once into a register
        // and reused across the TM rows handled by this thread.
        for (int dotIdx = 0; dotIdx < BK; ++dotIdx)
        {
            float tmpB = Bs[dotIdx * BN + threadCol];
            for (int resIdx = 0; resIdx < TM; ++resIdx)
            {
                threadResult[resIdx] += As[(threadRow * TM + resIdx) * BK + dotIdx] * tmpB;
            }
        }
        __syncthreads();
    }

    // Write back C = Alpha * A*B + Beta * C for this thread's TM results.
    for (int resIdx = 0; resIdx < TM; ++resIdx) {
        C[(threadRow * TM + resIdx) * N + threadCol] =
            Alpha * threadResult[resIdx] +
            Beta * C[(threadRow * TM + resIdx) * N + threadCol];
    }
}
 
int main()
{
    float* d_a, * d_b, * d_c;
    cudaMalloc((void**)&d_a, M * K * sizeof(float));
    cudaMalloc((void**)&d_b, K * N * sizeof(float));
    cudaMalloc((void**)&d_c, M * N * sizeof(float));
    // Note: the buffers are left uninitialized; this program only measures throughput.

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaEventRecord(start);

    // BM*BN/TM threads per block, one block per BM x BN tile of C.
    dim3 blockDim(BM * BN / TM);
    dim3 gridDim((N + BN - 1) / BN, (M + BM - 1) / BM);
 
    for (int i = 0; i < ITER; i++)
    {
        CacheGEMM<BM, BN, BK, TM><<<gridDim, blockDim>>>(d_a, d_b, d_c, M, N, K, alpha, beta);
    }
    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float msec = 0.f;
    cudaEventElapsedTime(&msec, start, end);

    // Each GEMM performs 2*M*N*K floating-point operations (one multiply and one add per term).
    long long workload = (long long)M * N * K * 2 * ITER;
    double avg_GFlops = (double(workload) / 1e9) / (double(msec) / 1e3);
    printf("Average performance: %.2f GFLOPS\n", avg_GFlops);

    cudaEventDestroy(start);
    cudaEventDestroy(end);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}
 
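The listing includes <cublas_v2.h> but never actually calls into cuBLAS. As a rough sketch only, not part of the program above, the snippet below shows one way a cuBLAS reference number could be timed on the same device buffers; it assumes it is appended inside main() after the kernel benchmark, reusing d_a, d_b, d_c, start, end, alpha, beta, and ITER. Because cuBLAS expects column-major storage, the row-major product is obtained by swapping the A/B operands and the M/N dimensions.

// Sketch (assumed, not from the original listing): time cublasSgemm on the
// same buffers for a reference throughput number.
cublasHandle_t handle;
cublasCreate(&handle);

cudaEventRecord(start);
for (int i = 0; i < ITER; i++)
{
    // Row-major C = alpha*A*B + beta*C is computed as the column-major
    // product C^T = alpha*B^T*A^T + beta*C^T, so A/B and M/N are swapped.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                N, M, K,
                &alpha,
                d_b, N,
                d_a, K,
                &beta,
                d_c, N);
}
cudaEventRecord(end);
cudaEventSynchronize(end);

float cublas_msec = 0.f;
cudaEventElapsedTime(&cublas_msec, start, end);
printf("cuBLAS reference: %.2f GFLOPS\n",
       (double((long long)M * N * K * 2 * ITER) / 1e9) / (double(cublas_msec) / 1e3));

cublasDestroy(handle);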