CUDA并行编程学习（4）-- 共享内存 Shared Memory

Sep 27, 2018 原创文章

基于CUDA的并行编程学习笔记

分享到：

请保证您的浏览器支持MathJax插件，以免数学公式无法显示

共享内存的优点
创建共享内存
例程：Matrix Multiplication
- 不使用共享内存
  - 算法说明
  - 代码清单
- 使用共享内存

共享内存的优点

使用共享内存的优点在于，一般情况下，共享内存比全局内存的访问速度更快。任何可以使用共享内存的地方，都应该将全局内存替换为共享内存。

创建共享内存

1、创建一个固定大小的共享数组


// Statically sized shared-memory array: 34 floats, size fixed in the declaration.
__shared__ float s_in[34];

2、动态申请数组

动态申请数组不需要对核函数进行修改，但是需要在数组声明前加上关键字 extern，并在启动核函数时通过 <<<grid, block, bytes>>> 的第三个参数指定共享内存的字节数。


// Unsized shared-memory array declared with extern: no element count is given
// here. With dynamic shared memory the byte size is passed as the third
// kernel-launch parameter.
extern __shared__ float s_in[];

例程：Matrix Multiplication

不使用共享内存

算法说明

矩阵A与矩阵B相乘得到矩阵C

如上图所示，每个线程读取矩阵A的一行和矩阵B的一列，计算矩阵C中对应的一个元素。因此，A 需要从全局内存中读取 B.width 次，B 需要从全局内存中读取 A.height 次。

代码清单


// Dense matrix of floats.
// Matrices are stored in row-major order: 
// M(row, col) = *(M.elements + row * M.width + col) 
typedef struct {    
      int width;    // number of columns (length of one row, in elements)
      int height;    // number of rows
      float* elements; // width * height floats, laid out row after row
    } Matrix; 

// Thread block size: each thread block is BLOCK_SIZE x BLOCK_SIZE threads
// (used for both dimensions of dimBlock in MatMul).
#define BLOCK_SIZE 16 

// Forward declaration of the matrix multiplication kernel.
// Needed because MatMul launches the kernel before its definition appears.
// Arguments are (A, B, C) with C receiving the product.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix); 

// Matrix multiplication - Host code
// Computes C = A * B on the device with MatMulKernel and copies the result
// back into C.elements.
//   A, B : input matrices; their elements are copied host -> device
//   C    : output matrix; C.elements receives C.width * C.height floats
//          copied device -> host
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
// NOTE: return codes of the CUDA runtime calls are not inspected here.
void MatMul(const Matrix A, const Matrix B, Matrix C) {
  // Load A and B to device memory.
  // Byte counts are widened to size_t before multiplying so that
  // width * height is not evaluated (and possibly overflowed) in int.
  Matrix d_A;
  d_A.width = A.width;
  d_A.height = A.height;
  size_t size = (size_t)A.width * (size_t)A.height * sizeof(float);
  cudaMalloc(&d_A.elements, size);
  cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

  Matrix d_B;
  d_B.width = B.width;
  d_B.height = B.height;
  size = (size_t)B.width * (size_t)B.height * sizeof(float);
  cudaMalloc(&d_B.elements, size);
  cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

  // Allocate C in device memory
  Matrix d_C;
  d_C.width = C.width;
  d_C.height = C.height;
  size = (size_t)C.width * (size_t)C.height * sizeof(float);
  cudaMalloc(&d_C.elements, size);

  // Invoke kernel: one thread per element of C, one block per
  // BLOCK_SIZE x BLOCK_SIZE patch of C.
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
  MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

  // Read C from device memory (size still holds the byte count of C).
  // The source is the device copy d_C; the original referenced an
  // undeclared identifier here.
  cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_A.elements);
  cudaFree(d_B.elements);
  cudaFree(d_C.elements);
  }
  
  // Matrix multiplication kernel called by MatMul().
  // Every thread produces exactly one element of C: the dot product of one
  // row of A with one column of B, read directly from A.elements/B.elements.
  // No bounds check is performed on the computed row/column.
  __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    // Position of this thread's output element inside C.
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Running sum of A(r, k) * B(k, c) over k.
    float acc = 0.0f;
    int k = 0;
    while (k < A.width) {
      acc += A.elements[r * A.width + k] * B.elements[k * B.width + c];
      ++k;
    }

    C.elements[r * C.width + c] = acc;
  }

使用共享内存

算法说明

在使用共享内存的情况下，每个线程块负责计算矩阵C的一个子矩阵$C_{sub}$，而线程块中的每个线程负责计算子矩阵$C_{sub}$中的一个元素。$C_{sub}$的计算方法如上图所示。$C_{sub}$可以看做是矩阵A和B的两个矩形子矩阵相乘。

为了方便计算和充分利用设备资源，要将这两个矩形子矩阵划分为若干个维度为 block_size 的方阵，$C_{sub}$ 即为这些方阵两两相乘的结果之和。

在具体的代码实现中，为了计算，首先从全局内存中采用 一个线程加载一个元素 的方式，将每个方阵加载到共享内存中。

整个过程中，A从全局内存中读取 (B.width/block_size) 次，而B从全局内存中读取 (A.height/block_size) 次。

代码清单

这一部分代码定义了矩阵的数据结构，其中包括矩阵长和宽，以及矩阵的步长。



// Dense matrix of floats, or a view onto part of one.
// Matrices are stored in row-major order: 
// M(row, col) = *(M.elements + row * M.stride + col) 
typedef struct {    
    int width;   // number of columns
    int height;    // number of rows
    int stride;     // elements between the starts of consecutive rows; MatMul sets it equal to width, GetSubMatrix copies it from the parent matrix
    float* elements; // pointer to element (0, 0)
    } Matrix;

函数GetElement，实现了从矩阵的指定位置提取元素值。


// Read one matrix element.
// Returns the value stored at (row, col) of A, using A.stride as the
// distance between consecutive rows. No bounds checking is done.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return *(A.elements + offset);
}

函数SetElement，实现了给矩阵的指定位置赋值。



// Write one matrix element.
// Stores value at (row, col) of A, using A.stride as the distance between
// consecutive rows. A is passed by value, but the write goes through
// A.elements and therefore lands in the underlying buffer.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    float* slot = A.elements + (row * A.stride + col);
    *slot = value;
}

函数GetSubMatrix，实现了提取矩阵的指定大小的子矩阵



// Build a BLOCK_SIZE x BLOCK_SIZE view onto A.
// The view starts `row` tiles down and `col` tiles to the right of A's
// upper-left corner. No data is copied: the result points into A.elements
// and keeps A's stride so that row stepping still works.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    // Offset (in elements) of the tile's upper-left corner inside A.
    const int firstElem = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;

    Matrix tile;
    tile.elements = A.elements + firstElem;
    tile.stride   = A.stride;
    tile.height   = BLOCK_SIZE;
    tile.width    = BLOCK_SIZE;
    return tile;
}

宏定义BLOCK_SIZE的大小


 // Thread block size: each thread block is BLOCK_SIZE x BLOCK_SIZE threads.
 // The same value is the edge length of the sub-matrices built by
 // GetSubMatrix and of the shared-memory tiles in MatMulKernel.
 #define BLOCK_SIZE 16

声明函数


// Forward declaration of the matrix multiplication kernel.
// Needed because MatMul launches the kernel before its definition appears.
// Arguments are (A, B, C) with C receiving the product.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

实现矩阵相乘


// Matrix multiplication - Host code
// Computes C = A * B on the device with the tiled MatMulKernel and copies
// the result back into C.elements.
//   A, B : input matrices; their elements are copied host -> device
//   C    : output matrix; C.elements receives C.width * C.height floats
//          copied device -> host
// The device copies are densely packed, so their stride equals their width.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
// NOTE: return codes of the CUDA runtime calls are not inspected here.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
  // Load A and B to device memory.
  // Byte counts are widened to size_t before multiplying so that
  // width * height is not evaluated (and possibly overflowed) in int.
  Matrix d_A;
  d_A.width = d_A.stride = A.width;
  d_A.height = A.height;
  size_t size = (size_t)A.width * (size_t)A.height * sizeof(float);
  cudaMalloc(&d_A.elements, size);
  cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

  Matrix d_B;
  d_B.width = d_B.stride = B.width;
  d_B.height = B.height;
  size = (size_t)B.width * (size_t)B.height * sizeof(float);
  cudaMalloc(&d_B.elements, size);
  cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

  // Allocate C in device memory
  Matrix d_C;
  d_C.width = d_C.stride = C.width;
  d_C.height = C.height;
  size = (size_t)C.width * (size_t)C.height * sizeof(float);
  cudaMalloc(&d_C.elements, size);

  // Invoke kernel: one block per BLOCK_SIZE x BLOCK_SIZE sub-matrix of C.
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
  MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

  // Read C from device memory (size still holds the byte count of C).
  cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_A.elements);
  cudaFree(d_B.elements);
  cudaFree(d_C.elements);
}

该例程的核函数


// Tiled matrix multiplication kernel called by MatMul().
// Each thread block produces one BLOCK_SIZE x BLOCK_SIZE sub-matrix of C and
// each thread produces one element of that sub-matrix. The matching tiles of
// A and B are staged in shared memory, one pair per loop iteration, with
// every thread loading one element of each tile.
// Uses two BLOCK_SIZE x BLOCK_SIZE float arrays of static shared memory.
// No bounds check is performed.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Shared-memory staging buffers for the current tiles of A and B.
    __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Which tile of C this block owns, and this thread's spot inside it.
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    // View onto the part of C written by this block.
    Matrix outTile = GetSubMatrix(C, tileRow, tileCol);

    // Partial dot product accumulated across all tile pairs.
    float acc = 0.0f;

    // Walk along the row of A tiles and the column of B tiles that
    // contribute to this block's output tile.
    const int tileCount = A.width / BLOCK_SIZE;
    for (int t = 0; t < tileCount; ++t) {
        const Matrix aTile = GetSubMatrix(A, tileRow, t);
        const Matrix bTile = GetSubMatrix(B, t, tileCol);

        // Each thread copies one element of each tile into shared memory.
        tileA[ty][tx] = GetElement(aTile, ty, tx);
        tileB[ty][tx] = GetElement(bTile, ty, tx);

        // Barrier: both tiles must be fully loaded before anyone reads them.
        __syncthreads();

        // Multiply row ty of the A tile with column tx of the B tile.
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            acc += tileA[ty][k] * tileB[k][tx];
        }

        // Barrier: all threads must finish reading the tiles before the
        // next iteration overwrites them.
        __syncthreads();
    }

    // Each thread writes its single result element.
    SetElement(outTile, ty, tx, acc);
}