#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>
const int M = 2048; const int N = 2048; const int K = 4096; float alpha = 1.0f; float beta = 0.5f; const int ITER = 1000;
const int BM = 64; const int BN = 64; const int BK = 8; const int TM = 8;
template<const int BM, const int BN, const int BK, const int TM> __global__ void CacheGEMM(float* A, float* B, float* C, int M, int N, int K, float Alpha, float Beta) { const unsigned int cRow = blockIdx.y; const unsigned int cCol = blockIdx.x;
float threadResult[TM]{ 0 };
const int threadCol = threadIdx.x % BN; const int threadRow = threadIdx.x / BN;
__shared__ float As[BM * BK]; __shared__ float Bs[BK * BN];
A += cRow * BM * K; B += cCol * BN; C += cRow * BM * N + cCol * BN;
const unsigned int innerRowA = threadIdx.x / BK; const unsigned int innerColA = threadIdx.x % BK; const unsigned int innerRowB = threadIdx.x / BN; const unsigned int innerColB = threadIdx.x % BN;
for (int blkIdx = 0; blkIdx < K; blkIdx += BK) { As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA]; Bs[innerRowB * BN + innerColB] = B[innerRowB * N + innerColB]; __syncthreads();
A += BK; B += BK * N; for (int dotIdx = 0; dotIdx < BK; ++dotIdx) { float tmpB = Bs[dotIdx * BN + threadCol]; for (int resIdx = 0; resIdx < TM; ++resIdx) { threadResult[resIdx] += As[(threadRow * TM + resIdx) * BK + dotIdx] * tmpB; } } __syncthreads(); }
for (int resIdx = 0; resIdx < TM; ++resIdx) { C[(threadRow * TM + resIdx) * N + threadCol] = Alpha * threadResult[resIdx] + Beta * C[(threadRow * TM + resIdx) * N + threadCol]; } }
int main() { cudaError_t cudaStat;
float* d_a, * d_b, * d_c; cudaMalloc((void**)&d_a, M * K * sizeof(float)); cudaMalloc((void**)&d_b, K * N * sizeof(float)); cudaMalloc((void**)&d_c, M * N * sizeof(float));
cudaEvent_t start, end; cudaEventCreate(&start); cudaEventCreate(&end);
cudaEventRecord(start);
dim3 blockDim(BM * BN / TM); dim3 gridDim((N + BN - 1) / BN, (M + BM - 1) / BM);
for (int i = 0; i < ITER; i++) { CacheGEMM <BM, BN, BK, TM><<<gridDim, blockDim>>> (d_a, d_b, d_c, M, N, K, alpha, beta); } cudaEventRecord(end); cudaEventSynchronize(end);
float msec = 0.f; cudaEventElapsedTime(&msec, start, end);
long long workfload = long long(M) * N * K * 2 * ITER; double avg_GFlops = (double(workfload) / 1e9) / (double(msec) / 1e3); printf_s("AveragePerformance %10.11f GFlops\n", avg_GFlops);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); }